try fix windows again
Created using spr 1.3.6
This commit is contained in:
commit
2ac7fe0c72
149
.github/workflows/llvm-project-tests.yml
vendored
149
.github/workflows/llvm-project-tests.yml
vendored
@ -1,149 +0,0 @@
|
||||
name: LLVM Project Tests
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
build_target:
|
||||
required: false
|
||||
projects:
|
||||
required: false
|
||||
extra_cmake_args:
|
||||
required: false
|
||||
os_list:
|
||||
required: false
|
||||
default: '["ubuntu-24.04", "windows-2019", "macOS-13"]'
|
||||
python_version:
|
||||
required: false
|
||||
type: string
|
||||
default: '3.11'
|
||||
workflow_call:
|
||||
inputs:
|
||||
build_target:
|
||||
required: false
|
||||
type: string
|
||||
default: "all"
|
||||
|
||||
projects:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
extra_cmake_args:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
os_list:
|
||||
required: false
|
||||
type: string
|
||||
# Use windows-2019 due to:
|
||||
# https://developercommunity.visualstudio.com/t/Prev-Issue---with-__assume-isnan-/1597317
|
||||
default: '["ubuntu-24.04", "windows-2019", "macOS-13"]'
|
||||
|
||||
python_version:
|
||||
required: false
|
||||
type: string
|
||||
default: '3.11'
|
||||
|
||||
concurrency:
|
||||
# Skip intermediate builds: always.
|
||||
# Cancel intermediate builds: only if it is a pull request build.
|
||||
# If the group name here is the same as the group name in the workflow that includes
|
||||
# this one, then the action will try to wait on itself and get stuck.
|
||||
group: llvm-project-${{ github.workflow }}-${{ inputs.projects }}-${{ inputs.python_version }}${{ github.ref }}
|
||||
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
|
||||
|
||||
jobs:
|
||||
lit-tests:
|
||||
name: Lit Tests
|
||||
runs-on: ${{ matrix.os }}
|
||||
container:
|
||||
image: ${{(startsWith(matrix.os, 'ubuntu') && 'ghcr.io/llvm/ci-ubuntu-24.04:latest') || null}}
|
||||
volumes:
|
||||
- /mnt/:/mnt/
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: ${{ fromJSON(inputs.os_list) }}
|
||||
steps:
|
||||
- name: Setup Windows
|
||||
if: startsWith(matrix.os, 'windows')
|
||||
uses: llvm/actions/setup-windows@main
|
||||
with:
|
||||
arch: amd64
|
||||
# On Windows, starting with win19/20220814.1, cmake chooses the 32-bit
|
||||
# python3.10.6 libraries instead of the 64-bit libraries when building
|
||||
# lldb. Using this setup-python action to make 3.10 the default
|
||||
# python fixes this.
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
|
||||
with:
|
||||
python-version: ${{ inputs.python_version }}
|
||||
- name: Install Ninja
|
||||
if: runner.os != 'Linux'
|
||||
uses: llvm/actions/install-ninja@main
|
||||
# actions/checkout deletes any existing files in the new git directory,
|
||||
# so this needs to either run before ccache-action or it has to use
|
||||
# clean: false.
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 250
|
||||
- name: Setup ccache
|
||||
uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
|
||||
with:
|
||||
# A full build of llvm, clang, lld, and lldb takes about 250MB
|
||||
# of ccache space. There's not much reason to have more than this,
|
||||
# because we usually won't need to save cache entries from older
|
||||
# builds. Also, there is an overall 10GB cache limit, and each
|
||||
# run creates a new cache entry so we want to ensure that we have
|
||||
# enough cache space for all the tests to run at once and still
|
||||
# fit under the 10 GB limit.
|
||||
# Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174
|
||||
max-size: 2G
|
||||
key: ${{ matrix.os }}
|
||||
variant: sccache
|
||||
- name: Build and Test
|
||||
env:
|
||||
# Workaround for https://github.com/actions/virtual-environments/issues/5900.
|
||||
# This should be a no-op for non-mac OSes
|
||||
PKG_CONFIG_PATH: /usr/local/Homebrew/Library/Homebrew/os/mac/pkgconfig//12
|
||||
shell: bash
|
||||
id: build-llvm
|
||||
run: |
|
||||
if [ "${{ runner.os }}" == "Linux" ]; then
|
||||
builddir="/mnt/build/"
|
||||
sudo mkdir -p $builddir
|
||||
sudo chown gha $builddir
|
||||
extra_cmake_args="-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang"
|
||||
else
|
||||
builddir="$(pwd)"/build
|
||||
fi
|
||||
if [ "${{ runner.os }}" == "macOS" ]; then
|
||||
# Workaround test failure on some lld tests on MacOS
|
||||
# https://github.com/llvm/llvm-project/issues/81967
|
||||
extra_cmake_args="-DLLVM_DISABLE_ASSEMBLY_FILES=ON"
|
||||
fi
|
||||
echo "llvm-builddir=$builddir" >> "$GITHUB_OUTPUT"
|
||||
cmake -G Ninja \
|
||||
-B "$builddir" \
|
||||
-S llvm \
|
||||
-DLLVM_ENABLE_PROJECTS="${{ inputs.projects }}" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DLLVM_ENABLE_ASSERTIONS=ON \
|
||||
-DLLDB_INCLUDE_TESTS=OFF \
|
||||
-DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl" \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=sccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
|
||||
$extra_cmake_args \
|
||||
${{ inputs.extra_cmake_args }}
|
||||
ninja -C "$builddir" '${{ inputs.build_target }}'
|
||||
|
||||
- name: Build and Test libclc
|
||||
if: "!startsWith(matrix.os, 'windows') && contains(inputs.projects, 'libclc')"
|
||||
env:
|
||||
LLVM_BUILDDIR: ${{ steps.build-llvm.outputs.llvm-builddir }}
|
||||
run: |
|
||||
# The libclc tests don't have a generated check target so all we can
|
||||
# do is build it.
|
||||
ninja -C "$LLVM_BUILDDIR"
|
||||
@ -1,32 +0,0 @@
|
||||
# This workflow will test the llvm-project-tests workflow in PRs
|
||||
# targeting the main branch. Since this workflow doesn't normally
|
||||
# run on main PRs, we need some way to test it to ensure new updates
|
||||
# don't break it.
|
||||
|
||||
name: LLVM Workflow Test
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- 'main'
|
||||
paths:
|
||||
- '.github/workflows/llvm-project-tests.yml'
|
||||
- '.github/workflows/llvm-project-workflow-tests.yml'
|
||||
|
||||
concurrency:
|
||||
# Skip intermediate builds: always.
|
||||
# Cancel intermediate builds: only if it is a pull request build.
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
|
||||
|
||||
jobs:
|
||||
llvm-test:
|
||||
if: github.repository_owner == 'llvm'
|
||||
name: Build and Test
|
||||
uses: ./.github/workflows/llvm-project-tests.yml
|
||||
with:
|
||||
build_target: check-all
|
||||
projects: clang;lld;libclc;lldb
|
||||
@ -581,6 +581,13 @@ def err_drv_reduced_module_output_overrided : Warning<
|
||||
"please consider use '-fmodule-output=' to specify the output file for reduced BMI explicitly">,
|
||||
InGroup<DiagGroup<"reduced-bmi-output-overrided">>;
|
||||
|
||||
def remark_found_cxx20_module_usage : Remark<
|
||||
"found C++20 module usage in file '%0'">,
|
||||
InGroup<ModulesDriver>;
|
||||
def remark_performing_driver_managed_module_build : Remark<
|
||||
"performing driver managed module build">,
|
||||
InGroup<ModulesDriver>;
|
||||
|
||||
def warn_drv_delayed_template_parsing_after_cxx20 : Warning<
|
||||
"-fdelayed-template-parsing is deprecated after C++20">,
|
||||
InGroup<DiagGroup<"delayed-template-parsing-in-cxx20">>;
|
||||
|
||||
@ -635,6 +635,7 @@ def ModuleConflict : DiagGroup<"module-conflict">;
|
||||
def ModuleFileExtension : DiagGroup<"module-file-extension">;
|
||||
def ModuleIncludeDirectiveTranslation : DiagGroup<"module-include-translation">;
|
||||
def ModuleMap : DiagGroup<"module-map">;
|
||||
def ModulesDriver : DiagGroup<"modules-driver">;
|
||||
def RoundTripCC1Args : DiagGroup<"round-trip-cc1-args">;
|
||||
def NewlineEOF : DiagGroup<"newline-eof">;
|
||||
def Nullability : DiagGroup<"nullability">;
|
||||
|
||||
@ -512,6 +512,9 @@ public:
|
||||
|
||||
/// BuildActions - Construct the list of actions to perform for the
|
||||
/// given arguments, which are only done for a single architecture.
|
||||
/// If the compilation is an explicit module build, delegates to
|
||||
/// BuildDriverManagedModuleBuildActions. Otherwise, BuildDefaultActions is
|
||||
/// used.
|
||||
///
|
||||
/// \param C - The compilation that is being built.
|
||||
/// \param Args - The input arguments.
|
||||
@ -796,6 +799,35 @@ private:
|
||||
/// compilation based on which -f(no-)?lto(=.*)? option occurs last.
|
||||
void setLTOMode(const llvm::opt::ArgList &Args);
|
||||
|
||||
/// BuildDefaultActions - Constructs the list of actions to perform
|
||||
/// for the provided arguments, which are only done for a single architecture.
|
||||
///
|
||||
/// \param C - The compilation that is being built.
|
||||
/// \param Args - The input arguments.
|
||||
/// \param Actions - The list to store the resulting actions onto.
|
||||
void BuildDefaultActions(Compilation &C, llvm::opt::DerivedArgList &Args,
|
||||
const InputList &Inputs, ActionList &Actions) const;
|
||||
|
||||
/// BuildDriverManagedModuleBuildActions - Performs a dependency
|
||||
/// scan and constructs the list of actions to perform for dependency order
|
||||
/// and the provided arguments. This is only done for a single architecture.
|
||||
///
|
||||
/// \param C - The compilation that is being built.
|
||||
/// \param Args - The input arguments.
|
||||
/// \param Actions - The list to store the resulting actions onto.
|
||||
void BuildDriverManagedModuleBuildActions(Compilation &C,
|
||||
llvm::opt::DerivedArgList &Args,
|
||||
const InputList &Inputs,
|
||||
ActionList &Actions) const;
|
||||
|
||||
/// Scans the leading lines of the C++ source inputs to detect C++20 module
|
||||
/// usage.
|
||||
///
|
||||
/// \returns True if module usage is detected, false otherwise, or an error on
|
||||
/// read failure.
|
||||
llvm::ErrorOr<bool>
|
||||
ScanInputsForCXX20ModulesUsage(const InputList &Inputs) const;
|
||||
|
||||
/// Retrieves a ToolChain for a particular \p Target triple.
|
||||
///
|
||||
/// Will cache ToolChains for the life of the driver object, and create them
|
||||
|
||||
@ -3296,6 +3296,13 @@ defm modules_reduced_bmi : BoolOption<"f", "modules-reduced-bmi",
|
||||
PosFlag<SetTrue, [], [ClangOption, CC1Option],
|
||||
"Generate the reduced BMI">>;
|
||||
|
||||
def fmodules_driver : Flag<["-"], "fmodules-driver">,
|
||||
Group<f_Group>, Visibility<[ClangOption]>,
|
||||
HelpText<"Enable support for driver managed module builds (experimental)">;
|
||||
def fno_modules_driver : Flag<["-"], "fno-modules-driver">,
|
||||
Group<f_Group>, Visibility<[ClangOption]>,
|
||||
HelpText<"Disable support for driver managed module builds (experimental)">;
|
||||
|
||||
def experimental_modules_reduced_bmi : Flag<["-"], "fexperimental-modules-reduced-bmi">,
|
||||
Group<f_Group>, Visibility<[ClangOption, CC1Option]>, Alias<fmodules_reduced_bmi>;
|
||||
|
||||
|
||||
@ -135,6 +135,13 @@ void printDependencyDirectivesAsSource(
|
||||
ArrayRef<dependency_directives_scan::Directive> Directives,
|
||||
llvm::raw_ostream &OS);
|
||||
|
||||
/// Scan an input source buffer for C++20 named module usage.
|
||||
///
|
||||
/// \param Source The input source buffer.
|
||||
///
|
||||
/// \returns true if any C++20 named modules related directive was found.
|
||||
bool scanInputForCXX20ModulesUsage(StringRef Source);
|
||||
|
||||
/// Functor that returns the dependency directives for a given file.
|
||||
class DependencyDirectivesGetter {
|
||||
public:
|
||||
|
||||
@ -98,5 +98,6 @@ add_clang_library(clangDriver
|
||||
|
||||
LINK_LIBS
|
||||
clangBasic
|
||||
clangLex
|
||||
${system_libs}
|
||||
)
|
||||
|
||||
@ -66,6 +66,7 @@
|
||||
#include "clang/Driver/Tool.h"
|
||||
#include "clang/Driver/ToolChain.h"
|
||||
#include "clang/Driver/Types.h"
|
||||
#include "clang/Lex/DependencyDirectivesScanner.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
@ -4188,6 +4189,11 @@ void Driver::handleArguments(Compilation &C, DerivedArgList &Args,
|
||||
YcArg = nullptr;
|
||||
}
|
||||
|
||||
if (Args.hasArgNoClaim(options::OPT_fmodules_driver))
|
||||
// TODO: Check against all incompatible -fmodules-driver arguments
|
||||
if (!ModulesModeCXX20 && !Args.hasArgNoClaim(options::OPT_fmodules))
|
||||
Args.eraseArg(options::OPT_fmodules_driver);
|
||||
|
||||
Arg *FinalPhaseArg;
|
||||
phases::ID FinalPhase = getFinalPhase(Args, &FinalPhaseArg);
|
||||
|
||||
@ -4314,6 +4320,33 @@ void Driver::handleArguments(Compilation &C, DerivedArgList &Args,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if at least one entry of \p Inputs carries the input type
/// types::TY_CXXModule, false otherwise (including for an empty list).
static bool hasCXXModuleInputType(const Driver::InputList &Inputs) {
  // Each input is a (type ID, argument) pair; only the type ID matters here.
  for (const auto &Input : Inputs)
    if (Input.first == types::TY_CXXModule)
      return true;
  return false;
}
|
||||
|
||||
/// Reads every C++ input through the driver's VFS and passes its contents to
/// scanInputForCXX20ModulesUsage. Scanning stops at the first input for which
/// usage is reported; remark_found_cxx20_module_usage is emitted for that file.
///
/// \returns true if an input was reported as using C++20 modules, false if
/// none was, or the error code of the first input whose buffer could not be
/// obtained.
llvm::ErrorOr<bool>
Driver::ScanInputsForCXX20ModulesUsage(const InputList &Inputs) const {
  for (const auto &Input : Inputs) {
    // Inputs that are not C++ are never scanned.
    if (!types::isCXX(Input.first))
      continue;

    StringRef Filename = Input.second->getSpelling();
    auto BufferOrErr = VFS->getBufferForFile(Filename);
    if (!BufferOrErr)
      return BufferOrErr.getError();

    // The buffer stays owned by BufferOrErr until the end of this iteration.
    const StringRef Contents = (*BufferOrErr)->getBuffer();
    if (!scanInputForCXX20ModulesUsage(Contents))
      continue;

    Diags.Report(diag::remark_found_cxx20_module_usage) << Filename;
    return true;
  }
  return false;
}
|
||||
|
||||
void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
|
||||
const InputList &Inputs, ActionList &Actions) const {
|
||||
llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
|
||||
@ -4325,6 +4358,33 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
|
||||
|
||||
handleArguments(C, Args, Inputs, Actions);
|
||||
|
||||
if (Args.hasFlag(options::OPT_fmodules_driver,
|
||||
options::OPT_fno_modules_driver, false)) {
|
||||
// TODO: Move the logic for implicitly enabling explicit-module-builds out
|
||||
// of -fmodules-driver once it is no longer experimental.
|
||||
// Currently, this serves diagnostic purposes only.
|
||||
bool UsesCXXModules = hasCXXModuleInputType(Inputs);
|
||||
if (!UsesCXXModules) {
|
||||
const auto ErrOrScanResult = ScanInputsForCXX20ModulesUsage(Inputs);
|
||||
if (!ErrOrScanResult) {
|
||||
Diags.Report(diag::err_cannot_open_file)
|
||||
<< ErrOrScanResult.getError().message();
|
||||
return;
|
||||
}
|
||||
UsesCXXModules = *ErrOrScanResult;
|
||||
}
|
||||
if (UsesCXXModules || Args.hasArg(options::OPT_fmodules))
|
||||
BuildDriverManagedModuleBuildActions(C, Args, Inputs, Actions);
|
||||
return;
|
||||
}
|
||||
|
||||
BuildDefaultActions(C, Args, Inputs, Actions);
|
||||
}
|
||||
|
||||
void Driver::BuildDefaultActions(Compilation &C, DerivedArgList &Args,
|
||||
const InputList &Inputs,
|
||||
ActionList &Actions) const {
|
||||
|
||||
bool UseNewOffloadingDriver =
|
||||
C.isOffloadingHostKind(Action::OFK_OpenMP) ||
|
||||
C.isOffloadingHostKind(Action::OFK_SYCL) ||
|
||||
@ -4608,6 +4668,13 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
|
||||
Args.ClaimAllArgs(options::OPT_cl_ignored_Group);
|
||||
}
|
||||
|
||||
/// Placeholder for the driver managed module build: at present it only emits
/// remark_performing_driver_managed_module_build. None of the parameters is
/// read and \p Actions is left untouched.
void Driver::BuildDriverManagedModuleBuildActions(
    Compilation &C, llvm::opt::DerivedArgList &Args, const InputList &Inputs,
    ActionList &Actions) const {
  Diags.Report(diag::remark_performing_driver_managed_module_build);
}
|
||||
|
||||
/// Returns the canonical name for the offloading architecture when using a HIP
|
||||
/// or CUDA architecture.
|
||||
static StringRef getCanonicalArchString(Compilation &C,
|
||||
|
||||
@ -83,6 +83,8 @@ struct Scanner {
|
||||
/// \returns True on error.
|
||||
bool scan(SmallVectorImpl<Directive> &Directives);
|
||||
|
||||
friend bool clang::scanInputForCXX20ModulesUsage(StringRef Source);
|
||||
|
||||
private:
|
||||
/// Lexes next token and advances \p First and the \p Lexer.
|
||||
[[nodiscard]] dependency_directives_scan::Token &
|
||||
@ -1075,3 +1077,51 @@ void clang::printDependencyDirectivesAsSource(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void skipUntilMaybeCXX20ModuleDirective(const char *&First,
|
||||
const char *const End) {
|
||||
assert(First <= End);
|
||||
while (First != End) {
|
||||
if (*First == '#') {
|
||||
++First;
|
||||
skipToNewlineRaw(First, End);
|
||||
}
|
||||
skipWhitespace(First, End);
|
||||
if (const auto Len = isEOL(First, End)) {
|
||||
First += Len;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/// Scans \p Source for a C++20 named module directive at the first position
/// reached by skipUntilMaybeCXX20ModuleDirective.
///
/// \returns true if lexing at that position yields a cxx_module_decl,
/// cxx_import_decl, cxx_export_module_decl or cxx_export_import_decl
/// directive; false otherwise, including when Scanner::lexModule returns true
/// (presumably its failure indication -- confirm against Scanner).
bool clang::scanInputForCXX20ModulesUsage(StringRef Source) {
  const char *Cursor = Source.begin();
  const char *const End = Source.end();
  skipUntilMaybeCXX20ModuleDirective(Cursor, End);
  if (Cursor == End)
    return false;

  // Cheap pre-filter so that a full lexer is only created when the next
  // character could start a module directive at all.
  const char Lead = *Cursor;
  if (Lead != 'i' && Lead != 'e' && Lead != 'm')
    return false;

  llvm::SmallVector<dependency_directives_scan::Token> Tokens;
  Scanner S(StringRef(Cursor, End - Cursor), Tokens, nullptr,
            SourceLocation());
  S.TheLexer.setParsingPreprocessorDirective(true);
  if (S.lexModule(Cursor, End))
    return false;

  // Report usage as soon as any collected directive is a module directive.
  for (const DirectiveWithTokens &D : S.DirsWithToks) {
    const auto Kind = D.Kind;
    if (Kind == dependency_directives_scan::cxx_module_decl ||
        Kind == dependency_directives_scan::cxx_import_decl ||
        Kind == dependency_directives_scan::cxx_export_module_decl ||
        Kind == dependency_directives_scan::cxx_export_import_decl)
      return true;
  }
  return false;
}
|
||||
|
||||
@ -6224,7 +6224,6 @@ void Parser::ParseTypeQualifierListOpt(
|
||||
case tok::kw___funcref:
|
||||
ParseWebAssemblyFuncrefTypeAttribute(DS.getAttributes());
|
||||
continue;
|
||||
goto DoneWithTypeQuals;
|
||||
|
||||
case tok::kw___pascal:
|
||||
if (AttrReqs & AR_VendorAttributesParsed) {
|
||||
|
||||
@ -99,28 +99,84 @@ def generate_cpp_merge_test(n: int) -> str:
|
||||
return cpp_code
|
||||
|
||||
|
||||
def analyze_trace_file(trace_path: str) -> tuple[float, float]:
|
||||
def generate_cpp_nested_loop_test(n: int) -> str:
|
||||
"""
|
||||
Parses the -ftime-trace JSON output to find durations.
|
||||
Generates C++ code with N levels of nested loops.
|
||||
This pattern tests how analysis performance scales with loop nesting depth,
|
||||
which is a key factor in the complexity of dataflow analyses on structured
|
||||
control flow.
|
||||
|
||||
Returns:
|
||||
A tuple of (lifetime_analysis_duration_us, total_clang_duration_us).
|
||||
Example (n=3):
|
||||
struct MyObj { int id; ~MyObj() {} };
|
||||
void nested_loops_3() {
|
||||
MyObj* p = nullptr;
|
||||
for(int i0=0; i0<2; ++i0) {
|
||||
MyObj s0;
|
||||
p = &s0;
|
||||
for(int i1=0; i1<2; ++i1) {
|
||||
MyObj s1;
|
||||
p = &s1;
|
||||
for(int i2=0; i2<2; ++i2) {
|
||||
MyObj s2;
|
||||
p = &s2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
lifetime_duration = 0.0
|
||||
total_duration = 0.0
|
||||
if n <= 0:
|
||||
return "// Nesting depth must be positive."
|
||||
|
||||
cpp_code = "struct MyObj { int id; ~MyObj() {} };\n\n"
|
||||
cpp_code += f"void nested_loops_{n}() {{\n"
|
||||
cpp_code += " MyObj* p = nullptr;\n"
|
||||
|
||||
for i in range(n):
|
||||
indent = " " * (i + 1)
|
||||
cpp_code += f"{indent}for(int i{i}=0; i{i}<2; ++i{i}) {{\n"
|
||||
cpp_code += f"{indent} MyObj s{i}; p = &s{i};\n"
|
||||
|
||||
for i in range(n - 1, -1, -1):
|
||||
indent = " " * (i + 1)
|
||||
cpp_code += f"{indent}}}\n"
|
||||
|
||||
cpp_code += "}\n"
|
||||
cpp_code += f"\nint main() {{ nested_loops_{n}(); return 0; }}\n"
|
||||
return cpp_code
|
||||
|
||||
|
||||
def analyze_trace_file(trace_path: str) -> dict:
|
||||
"""
|
||||
Parses the -ftime-trace JSON output to find durations for the lifetime
|
||||
analysis and its sub-phases.
|
||||
Returns a dictionary of durations in microseconds.
|
||||
"""
|
||||
durations = {
|
||||
"lifetime_us": 0.0,
|
||||
"total_us": 0.0,
|
||||
"fact_gen_us": 0.0,
|
||||
"loan_prop_us": 0.0,
|
||||
"expired_loans_us": 0.0,
|
||||
}
|
||||
event_name_map = {
|
||||
"LifetimeSafetyAnalysis": "lifetime_us",
|
||||
"ExecuteCompiler": "total_us",
|
||||
"FactGenerator": "fact_gen_us",
|
||||
"LoanPropagation": "loan_prop_us",
|
||||
"ExpiredLoans": "expired_loans_us",
|
||||
}
|
||||
try:
|
||||
with open(trace_path, "r") as f:
|
||||
trace_data = json.load(f)
|
||||
for event in trace_data.get("traceEvents", []):
|
||||
if event.get("name") == "LifetimeSafetyAnalysis":
|
||||
lifetime_duration += float(event.get("dur", 0))
|
||||
if event.get("name") == "ExecuteCompiler":
|
||||
total_duration += float(event.get("dur", 0))
|
||||
|
||||
event_name = event.get("name")
|
||||
if event_name in event_name_map:
|
||||
key = event_name_map[event_name]
|
||||
durations[key] += float(event.get("dur", 0))
|
||||
except (IOError, json.JSONDecodeError) as e:
|
||||
print(f"Error reading or parsing trace file {trace_path}: {e}", file=sys.stderr)
|
||||
return 0.0, 0.0
|
||||
return lifetime_duration, total_duration
|
||||
return {key: 0.0 for key in durations}
|
||||
return durations
|
||||
|
||||
|
||||
def power_law(n, c, k):
|
||||
@ -135,8 +191,29 @@ def human_readable_time(ms: float) -> str:
|
||||
return f"{ms:.2f} ms"
|
||||
|
||||
|
||||
def calculate_complexity(n_data, y_data) -> tuple[float | None, float | None]:
|
||||
"""
|
||||
Calculates the exponent 'k' for the power law fit y = c * n^k.
|
||||
Returns a tuple of (k, k_standard_error).
|
||||
"""
|
||||
try:
|
||||
if len(n_data) < 3 or np.all(y_data < 1e-6) or np.var(y_data) < 1e-6:
|
||||
return None, None
|
||||
|
||||
non_zero_indices = y_data > 0
|
||||
if np.sum(non_zero_indices) < 3:
|
||||
return None, None
|
||||
|
||||
n_fit, y_fit = n_data[non_zero_indices], y_data[non_zero_indices]
|
||||
popt, pcov = curve_fit(power_law, n_fit, y_fit, p0=[0, 1], maxfev=5000)
|
||||
k_stderr = np.sqrt(np.diag(pcov))[1]
|
||||
return popt[1], k_stderr
|
||||
except (RuntimeError, ValueError):
|
||||
return None, None
|
||||
|
||||
|
||||
def generate_markdown_report(results: dict) -> str:
|
||||
"""Generates a Markdown-formatted report from the benchmark results."""
|
||||
"""Generates a concise, Markdown-formatted report from the benchmark results."""
|
||||
report = []
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S %Z")
|
||||
report.append(f"# Lifetime Analysis Performance Report")
|
||||
@ -146,54 +223,52 @@ def generate_markdown_report(results: dict) -> str:
|
||||
for test_name, data in results.items():
|
||||
title = data["title"]
|
||||
report.append(f"## Test Case: {title}")
|
||||
report.append("")
|
||||
report.append("\n**Timing Results:**\n")
|
||||
|
||||
# Table header
|
||||
report.append("| N | Analysis Time | Total Clang Time |")
|
||||
report.append("|:----|--------------:|-----------------:|")
|
||||
report.append(
|
||||
"| N (Input Size) | Total Time | Analysis Time (%) | Fact Generator (%) | Loan Propagation (%) | Expired Loans (%) |"
|
||||
)
|
||||
report.append(
|
||||
"|:---------------|-----------:|------------------:|-------------------:|---------------------:|------------------:|"
|
||||
)
|
||||
|
||||
# Table rows
|
||||
n_data = np.array(data["n"])
|
||||
analysis_data = np.array(data["lifetime_ms"])
|
||||
total_data = np.array(data["total_ms"])
|
||||
total_ms_data = np.array(data["total_ms"])
|
||||
for i in range(len(n_data)):
|
||||
analysis_str = human_readable_time(analysis_data[i])
|
||||
total_str = human_readable_time(total_data[i])
|
||||
report.append(f"| {n_data[i]:<3} | {analysis_str:>13} | {total_str:>16} |")
|
||||
total_t = total_ms_data[i]
|
||||
if total_t < 1e-6:
|
||||
total_t = 1.0 # Avoid division by zero
|
||||
|
||||
report.append("")
|
||||
row = [
|
||||
f"| {n_data[i]:<14} |",
|
||||
f"{human_readable_time(total_t):>10} |",
|
||||
f"{data['lifetime_ms'][i] / total_t * 100:>17.2f}% |",
|
||||
f"{data['fact_gen_ms'][i] / total_t * 100:>18.2f}% |",
|
||||
f"{data['loan_prop_ms'][i] / total_t * 100:>20.2f}% |",
|
||||
f"{data['expired_loans_ms'][i] / total_t * 100:>17.2f}% |",
|
||||
]
|
||||
report.append(" ".join(row))
|
||||
|
||||
# Complexity analysis
|
||||
report.append(f"**Complexity Analysis:**")
|
||||
try:
|
||||
# Curve fitting requires at least 3 points
|
||||
if len(n_data) < 3:
|
||||
raise ValueError("Not enough data points to perform curve fitting.")
|
||||
report.append("\n**Complexity Analysis:**\n")
|
||||
report.append("| Analysis Phase | Complexity O(n<sup>k</sup>) |")
|
||||
report.append("|:------------------|:--------------------------|")
|
||||
|
||||
popt, pcov = curve_fit(
|
||||
power_law, n_data, analysis_data, p0=[0, 2], maxfev=5000
|
||||
)
|
||||
_, k = popt
|
||||
analysis_phases = {
|
||||
"Total Analysis": data["lifetime_ms"],
|
||||
"FactGenerator": data["fact_gen_ms"],
|
||||
"LoanPropagation": data["loan_prop_ms"],
|
||||
"ExpiredLoans": data["expired_loans_ms"],
|
||||
}
|
||||
|
||||
# Confidence Interval for k
|
||||
alpha = 0.05 # 95% confidence
|
||||
dof = max(0, len(n_data) - len(popt)) # degrees of freedom
|
||||
t_val = t.ppf(1.0 - alpha / 2.0, dof)
|
||||
# Standard error of the parameters
|
||||
perr = np.sqrt(np.diag(pcov))
|
||||
k_stderr = perr[1]
|
||||
k_ci_lower = k - t_val * k_stderr
|
||||
k_ci_upper = k + t_val * k_stderr
|
||||
|
||||
report.append(
|
||||
f"- The performance for this case scales approx. as **O(n<sup>{k:.2f}</sup>)**."
|
||||
)
|
||||
report.append(
|
||||
f"- **95% Confidence interval for exponent:** `[{k_ci_lower:.2f}, {k_ci_upper:.2f}]`."
|
||||
)
|
||||
|
||||
except (RuntimeError, ValueError) as e:
|
||||
report.append(f"- Could not determine a best-fit curve for the data: {e}")
|
||||
for phase_name, y_data in analysis_phases.items():
|
||||
k, delta = calculate_complexity(n_data, np.array(y_data))
|
||||
if k is not None and delta is not None:
|
||||
complexity_str = f"O(n<sup>{k:.2f}</sup> ± {delta:.2f})"
|
||||
else:
|
||||
complexity_str = "(Negligible)"
|
||||
report.append(f"| {phase_name:<17} | {complexity_str:<25} |")
|
||||
|
||||
report.append("\n---\n")
|
||||
|
||||
@ -202,7 +277,7 @@ def generate_markdown_report(results: dict) -> str:
|
||||
|
||||
def run_single_test(
|
||||
clang_binary: str, output_dir: str, test_name: str, generator_func, n: int
|
||||
) -> tuple[float, float]:
|
||||
) -> dict:
|
||||
"""Generates, compiles, and benchmarks a single test case."""
|
||||
print(f"--- Running Test: {test_name.capitalize()} with N={n} ---")
|
||||
|
||||
@ -221,7 +296,8 @@ def run_single_test(
|
||||
"-o",
|
||||
"/dev/null",
|
||||
"-ftime-trace=" + trace_file,
|
||||
"-Wexperimental-lifetime-safety",
|
||||
"-Xclang",
|
||||
"-fexperimental-lifetime-safety",
|
||||
"-std=c++17",
|
||||
source_file,
|
||||
]
|
||||
@ -231,11 +307,12 @@ def run_single_test(
|
||||
if result.returncode != 0:
|
||||
print(f"Compilation failed for N={n}!", file=sys.stderr)
|
||||
print(result.stderr, file=sys.stderr)
|
||||
return 0.0, 0.0
|
||||
return {}
|
||||
|
||||
lifetime_us, total_us = analyze_trace_file(trace_file)
|
||||
|
||||
return lifetime_us / 1000.0, total_us / 1000.0
|
||||
durations_us = analyze_trace_file(trace_file)
|
||||
return {
|
||||
key.replace("_us", "_ms"): value / 1000.0 for key, value in durations_us.items()
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -270,6 +347,12 @@ if __name__ == "__main__":
|
||||
"generator_func": generate_cpp_merge_test,
|
||||
"n_values": [10, 50, 100, 200, 400, 800],
|
||||
},
|
||||
{
|
||||
"name": "nested_loops",
|
||||
"title": "Deeply Nested Loops",
|
||||
"generator_func": generate_cpp_nested_loop_test,
|
||||
"n_values": [10, 50, 100, 200, 400, 800],
|
||||
},
|
||||
]
|
||||
|
||||
results = {}
|
||||
@ -282,21 +365,28 @@ if __name__ == "__main__":
|
||||
"n": [],
|
||||
"lifetime_ms": [],
|
||||
"total_ms": [],
|
||||
"fact_gen_ms": [],
|
||||
"loan_prop_ms": [],
|
||||
"expired_loans_ms": [],
|
||||
}
|
||||
for n in config["n_values"]:
|
||||
lifetime_ms, total_ms = run_single_test(
|
||||
durations_ms = run_single_test(
|
||||
args.clang_binary,
|
||||
args.output_dir,
|
||||
test_name,
|
||||
config["generator_func"],
|
||||
n,
|
||||
)
|
||||
if total_ms > 0:
|
||||
if durations_ms:
|
||||
results[test_name]["n"].append(n)
|
||||
results[test_name]["lifetime_ms"].append(lifetime_ms)
|
||||
results[test_name]["total_ms"].append(total_ms)
|
||||
for key, value in durations_ms.items():
|
||||
results[test_name][key].append(value)
|
||||
|
||||
print(
|
||||
f" Total: {human_readable_time(total_ms)} | Analysis: {human_readable_time(lifetime_ms)}"
|
||||
f" Total Analysis: {human_readable_time(durations_ms['lifetime_ms'])} | "
|
||||
f"FactGen: {human_readable_time(durations_ms['fact_gen_ms'])} | "
|
||||
f"LoanProp: {human_readable_time(durations_ms['loan_prop_ms'])} | "
|
||||
f"ExpiredLoans: {human_readable_time(durations_ms['expired_loans_ms'])}"
|
||||
)
|
||||
|
||||
print("\n\n" + "=" * 80)
|
||||
@ -305,3 +395,8 @@ if __name__ == "__main__":
|
||||
|
||||
markdown_report = generate_markdown_report(results)
|
||||
print(markdown_report)
|
||||
|
||||
report_filename = os.path.join(args.output_dir, "performance_report.md")
|
||||
with open(report_filename, "w") as f:
|
||||
f.write(markdown_report)
|
||||
print(f"Report saved to: {report_filename}")
|
||||
|
||||
192
clang/test/Driver/modules-driver-cxx20-module-usage-scanner.cpp
Normal file
192
clang/test/Driver/modules-driver-cxx20-module-usage-scanner.cpp
Normal file
@ -0,0 +1,192 @@
|
||||
// The driver never checks to implicitly enable the explicit module build
|
||||
// support unless at least two input files are provided.
|
||||
// To trigger the C++20 module usage check, we always pass a second dummy file
|
||||
// as input.
|
||||
// TODO: Remove -fmodules everywhere once implicitly enabled explicit module
|
||||
// builds are supported.
|
||||
|
||||
// RUN: split-file %s %t
|
||||
//--- empty.cpp
|
||||
// Nothing here
|
||||
|
||||
//--- only-global.cpp
|
||||
// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/only-global.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK1
|
||||
// CHECK1: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
module;
|
||||
|
||||
//--- only-import.cpp
|
||||
// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/only-import.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK2
|
||||
// CHECK2: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
import A;
|
||||
|
||||
//--- only-export.cpp
|
||||
// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/only-export.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK3
|
||||
// CHECK3: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
export module A;
|
||||
|
||||
//--- leading-line-comment.cpp
|
||||
// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/leading-line-comment.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK4
|
||||
// CHECK4: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
// My line comment
|
||||
import A;
|
||||
|
||||
//--- leading-block-comment1.cpp
|
||||
// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/leading-block-comment1.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK5
|
||||
// CHECK5: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
/*My block comment */
|
||||
import A;
|
||||
|
||||
//--- leading-block-comment2.cpp
|
||||
// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/leading-block-comment2.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK6
|
||||
// CHECK6: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
/*My block comment */ import A;
|
||||
|
||||
//--- inline-block-comment1.cpp
|
||||
// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/inline-block-comment1.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK7
|
||||
// CHECK7: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
export/*a comment*/module/*another comment*/A;
|
||||
|
||||
//--- inline-block-comment2.cpp
|
||||
// RUN: %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/inline-block-comment2.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK8
|
||||
// CHECK8: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
module/*a comment*/;
|
||||
|
||||
//--- leading-directives.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/leading-directives.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK9
|
||||
// CHECK9: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
#define A
|
||||
#undef A
|
||||
#if A
|
||||
#ifdef A
|
||||
#elifdef A
|
||||
#elifndef A
|
||||
#endif
|
||||
#ifndef A
|
||||
#elif A
|
||||
#else
|
||||
#endif
|
||||
#endif
|
||||
#pragma once;
|
||||
#include <iostream>
|
||||
import m;
|
||||
|
||||
//--- multiline-directive.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/multiline-directive.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK10
|
||||
// CHECK10: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
#define MACRO(a, \
|
||||
b) \
|
||||
call((a), \
|
||||
(b)
|
||||
import a;
|
||||
|
||||
//--- leading-line-splice.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/leading-line-splice.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK11
|
||||
// CHECK11: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
\
|
||||
module;
|
||||
|
||||
//--- leading-line-splice-trailing-whitespace.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/leading-line-splice-trailing-whitespace.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK12
|
||||
// CHECK12: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
// v This backslash has trailing whitespace.
|
||||
\
|
||||
export module A;
|
||||
|
||||
//--- comment-line-splice.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/comment-line-splice.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK13
|
||||
// CHECK13-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
// My comment continues next-line!\
|
||||
import A;
|
||||
|
||||
//--- comment-line-splice-trailing-whitespace.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/comment-line-splice-trailing-whitespace.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK14
|
||||
// CHECK14-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
// My comment continues next-line! This backslash has trailing whitespace. -> \
|
||||
module;
|
||||
|
||||
//--- line-splice-in-directive1.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/line-splice-in-directive1.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK15
|
||||
// CHECK15: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
|
||||
module\
|
||||
;
|
||||
|
||||
//--- line-splice-in-directive2.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/line-splice-in-directive2.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK16
|
||||
// CHECK16: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
|
||||
export\
|
||||
module\
|
||||
A;
|
||||
|
||||
//--- no-module-usage1.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/no-module-usage1.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK17
|
||||
// CHECK17-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
auto main() -> int {}
|
||||
|
||||
//--- no-module-usage2.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/no-module-usage2.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK18
|
||||
// CHECK18-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
moduleStruct{};
|
||||
|
||||
//--- no-module-usage3.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/no-module-usage3.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK19
|
||||
// CHECK19-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
export_struct{};
|
||||
|
||||
//--- no-module-usage-namespace-import.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/no-module-usage-namespace-import.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK20
|
||||
// CHECK20-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
import::inner xi = {};
|
||||
|
||||
//--- no-module-usage-namespace-module.cpp
|
||||
// RUN: %clang -std=c++23 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: %t/no-module-usage-namespace-module.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --allow-empty --check-prefix=CHECK21
|
||||
// CHECK21-NOT: remark: found C++20 module usage in file '{{.*}}' [-Rmodules-driver]
|
||||
module::inner yi = {};
|
||||
|
||||
// RUN: not %clang -std=c++20 -ccc-print-phases -fmodules-driver -Rmodules-driver \
|
||||
// RUN: imaginary-file.cpp %t/empty.cpp 2>&1 \
|
||||
// RUN: | FileCheck %s --check-prefix=CHECK-NON-EXISTING-FILE-ERR
|
||||
// CHECK-NON-EXISTING-FILE-ERR: clang: error: no such file or directory: 'imaginary-file.cpp'
|
||||
@ -10,6 +10,10 @@
|
||||
|
||||
// REQUIRES: stable-runtime
|
||||
|
||||
// rdar://158303080 top few frames are at times inaccurate in ubsan fast stack
|
||||
// unwind on darwin
|
||||
// XFAIL: (darwin && ubsan && (arm64-target-arch || arm64e-target-arch))
|
||||
|
||||
// XFAIL: target={{.*netbsd.*}} && !asan
|
||||
|
||||
volatile int *null = 0;
|
||||
|
||||
@ -219,6 +219,7 @@ using DistSchedule = tomp::clause::DistScheduleT<TypeTy, IdTy, ExprTy>;
|
||||
using Doacross = tomp::clause::DoacrossT<TypeTy, IdTy, ExprTy>;
|
||||
using DynamicAllocators =
|
||||
tomp::clause::DynamicAllocatorsT<TypeTy, IdTy, ExprTy>;
|
||||
using DynGroupprivate = tomp::clause::DynGroupprivateT<TypeTy, IdTy, ExprTy>;
|
||||
using Enter = tomp::clause::EnterT<TypeTy, IdTy, ExprTy>;
|
||||
using Exclusive = tomp::clause::ExclusiveT<TypeTy, IdTy, ExprTy>;
|
||||
using Fail = tomp::clause::FailT<TypeTy, IdTy, ExprTy>;
|
||||
|
||||
@ -525,6 +525,8 @@ public:
|
||||
NODE(parser, OmpAbsentClause)
|
||||
NODE(parser, OmpAffinityClause)
|
||||
NODE(OmpAffinityClause, Modifier)
|
||||
NODE(parser, OmpAccessGroup)
|
||||
NODE_ENUM(OmpAccessGroup, Value)
|
||||
NODE(parser, OmpAlignment)
|
||||
NODE(parser, OmpAlignClause)
|
||||
NODE(parser, OmpAlignedClause)
|
||||
@ -569,6 +571,8 @@ public:
|
||||
NODE_ENUM(OmpDependenceType, Value)
|
||||
NODE(parser, OmpTaskDependenceType)
|
||||
NODE_ENUM(OmpTaskDependenceType, Value)
|
||||
NODE(parser, OmpDynGroupprivateClause)
|
||||
NODE(OmpDynGroupprivateClause, Modifier)
|
||||
NODE(parser, OmpIndirectClause)
|
||||
NODE(parser, OmpIterationOffset)
|
||||
NODE(parser, OmpIteration)
|
||||
|
||||
@ -3736,6 +3736,11 @@ inline namespace modifier {
|
||||
// ENUM_CLASS(Value, Keyword1, Keyword2);
|
||||
// };
|
||||
|
||||
struct OmpAccessGroup {
|
||||
ENUM_CLASS(Value, Cgroup);
|
||||
WRAPPER_CLASS_BOILERPLATE(OmpAccessGroup, Value);
|
||||
};
|
||||
|
||||
// Ref: [4.5:72-81], [5.0:110-119], [5.1:134-143], [5.2:169-170]
|
||||
//
|
||||
// alignment ->
|
||||
@ -4019,8 +4024,9 @@ struct OmpOrderModifier {
|
||||
//
|
||||
// prescriptiveness ->
|
||||
// STRICT // since 5.1
|
||||
// FALLBACK // since 6.1
|
||||
struct OmpPrescriptiveness {
|
||||
ENUM_CLASS(Value, Strict)
|
||||
ENUM_CLASS(Value, Strict, Fallback)
|
||||
WRAPPER_CLASS_BOILERPLATE(OmpPrescriptiveness, Value);
|
||||
};
|
||||
|
||||
@ -4375,6 +4381,12 @@ struct OmpDeviceTypeClause {
|
||||
WRAPPER_CLASS_BOILERPLATE(OmpDeviceTypeClause, DeviceTypeDescription);
|
||||
};
|
||||
|
||||
struct OmpDynGroupprivateClause {
|
||||
TUPLE_CLASS_BOILERPLATE(OmpDynGroupprivateClause);
|
||||
MODIFIER_BOILERPLATE(OmpAccessGroup, OmpPrescriptiveness);
|
||||
std::tuple<MODIFIERS(), ScalarIntExpr> t;
|
||||
};
|
||||
|
||||
// Ref: [5.2:158-159], [6.0:289-290]
|
||||
//
|
||||
// enter-clause ->
|
||||
|
||||
@ -396,6 +396,8 @@ makePrescriptiveness(parser::OmpPrescriptiveness::Value v) {
|
||||
switch (v) {
|
||||
case parser::OmpPrescriptiveness::Value::Strict:
|
||||
return clause::Prescriptiveness::Strict;
|
||||
case parser::OmpPrescriptiveness::Value::Fallback:
|
||||
return clause::Prescriptiveness::Fallback;
|
||||
}
|
||||
llvm_unreachable("Unexpected prescriptiveness");
|
||||
}
|
||||
@ -770,6 +772,27 @@ Doacross make(const parser::OmpClause::Doacross &inp,
|
||||
|
||||
// DynamicAllocators: empty
|
||||
|
||||
DynGroupprivate make(const parser::OmpClause::DynGroupprivate &inp,
|
||||
semantics::SemanticsContext &semaCtx) {
|
||||
// inp.v -> parser::OmpDynGroupprivateClause
|
||||
CLAUSET_ENUM_CONVERT( //
|
||||
convert, parser::OmpAccessGroup::Value, DynGroupprivate::AccessGroup,
|
||||
// clang-format off
|
||||
MS(Cgroup, Cgroup)
|
||||
// clang-format on
|
||||
);
|
||||
|
||||
auto &mods = semantics::OmpGetModifiers(inp.v);
|
||||
auto *m0 = semantics::OmpGetUniqueModifier<parser::OmpAccessGroup>(mods);
|
||||
auto *m1 = semantics::OmpGetUniqueModifier<parser::OmpPrescriptiveness>(mods);
|
||||
auto &size = std::get<parser::ScalarIntExpr>(inp.v.t);
|
||||
|
||||
return DynGroupprivate{
|
||||
{/*AccessGroup=*/maybeApplyToV(convert, m0),
|
||||
/*Prescriptiveness=*/maybeApplyToV(makePrescriptiveness, m1),
|
||||
/*Size=*/makeExpr(size, semaCtx)}};
|
||||
}
|
||||
|
||||
Enter make(const parser::OmpClause::Enter &inp,
|
||||
semantics::SemanticsContext &semaCtx) {
|
||||
// inp.v -> parser::OmpEnterClause
|
||||
|
||||
@ -469,6 +469,9 @@ TYPE_PARSER(sourced(construct<OmpContextSelectorSpecification>(
|
||||
|
||||
// --- Parsers for clause modifiers -----------------------------------
|
||||
|
||||
TYPE_PARSER(construct<OmpAccessGroup>( //
|
||||
"CGROUP" >> pure(OmpAccessGroup::Value::Cgroup)))
|
||||
|
||||
TYPE_PARSER(construct<OmpAlignment>(scalarIntExpr))
|
||||
|
||||
TYPE_PARSER(construct<OmpAlignModifier>( //
|
||||
@ -573,7 +576,8 @@ TYPE_PARSER(construct<OmpOrderingModifier>(
|
||||
"SIMD" >> pure(OmpOrderingModifier::Value::Simd)))
|
||||
|
||||
TYPE_PARSER(construct<OmpPrescriptiveness>(
|
||||
"STRICT" >> pure(OmpPrescriptiveness::Value::Strict)))
|
||||
"STRICT" >> pure(OmpPrescriptiveness::Value::Strict) ||
|
||||
"FALLBACK" >> pure(OmpPrescriptiveness::Value::Fallback)))
|
||||
|
||||
TYPE_PARSER(construct<OmpPresentModifier>( //
|
||||
"PRESENT" >> pure(OmpPresentModifier::Value::Present)))
|
||||
@ -636,6 +640,12 @@ TYPE_PARSER(sourced(construct<OmpDependClause::TaskDep::Modifier>(sourced(
|
||||
construct<OmpDependClause::TaskDep::Modifier>(
|
||||
Parser<OmpTaskDependenceType>{})))))
|
||||
|
||||
TYPE_PARSER( //
|
||||
sourced(construct<OmpDynGroupprivateClause::Modifier>(
|
||||
Parser<OmpAccessGroup>{})) ||
|
||||
sourced(construct<OmpDynGroupprivateClause::Modifier>(
|
||||
Parser<OmpPrescriptiveness>{})))
|
||||
|
||||
TYPE_PARSER(
|
||||
sourced(construct<OmpDeviceClause::Modifier>(Parser<OmpDeviceModifier>{})))
|
||||
|
||||
@ -777,6 +787,10 @@ TYPE_PARSER(construct<OmpDefaultClause>(
|
||||
Parser<OmpDefaultClause::DataSharingAttribute>{}) ||
|
||||
construct<OmpDefaultClause>(indirect(Parser<OmpDirectiveSpecification>{}))))
|
||||
|
||||
TYPE_PARSER(construct<OmpDynGroupprivateClause>(
|
||||
maybe(nonemptyList(Parser<OmpDynGroupprivateClause::Modifier>{}) / ":"),
|
||||
scalarIntExpr))
|
||||
|
||||
TYPE_PARSER(construct<OmpEnterClause>(
|
||||
maybe(nonemptyList(Parser<OmpEnterClause::Modifier>{}) / ":"),
|
||||
Parser<OmpObjectList>{}))
|
||||
@ -1068,6 +1082,9 @@ TYPE_PARSER( //
|
||||
construct<OmpClause>(parenthesized(Parser<OmpDoacrossClause>{})) ||
|
||||
"DYNAMIC_ALLOCATORS" >>
|
||||
construct<OmpClause>(construct<OmpClause::DynamicAllocators>()) ||
|
||||
"DYN_GROUPPRIVATE" >>
|
||||
construct<OmpClause>(construct<OmpClause::DynGroupprivate>(
|
||||
parenthesized(Parser<OmpDynGroupprivateClause>{}))) ||
|
||||
"ENTER" >> construct<OmpClause>(construct<OmpClause::Enter>(
|
||||
parenthesized(Parser<OmpEnterClause>{}))) ||
|
||||
"EXCLUSIVE" >> construct<OmpClause>(construct<OmpClause::Exclusive>(
|
||||
|
||||
@ -2250,6 +2250,11 @@ public:
|
||||
Walk(std::get<OmpObjectList>(x.t));
|
||||
Walk(": ", std::get<std::optional<std::list<Modifier>>>(x.t));
|
||||
}
|
||||
void Unparse(const OmpDynGroupprivateClause &x) {
|
||||
using Modifier = OmpDynGroupprivateClause::Modifier;
|
||||
Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": ");
|
||||
Walk(std::get<ScalarIntExpr>(x.t));
|
||||
}
|
||||
void Unparse(const OmpEnterClause &x) {
|
||||
using Modifier = OmpEnterClause::Modifier;
|
||||
Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": ");
|
||||
@ -2941,6 +2946,7 @@ public:
|
||||
WALK_NESTED_ENUM(OmpTaskDependenceType, Value) // OMP task-dependence-type
|
||||
WALK_NESTED_ENUM(OmpScheduleClause, Kind) // OMP schedule-kind
|
||||
WALK_NESTED_ENUM(OmpSeverityClause, Severity) // OMP severity
|
||||
WALK_NESTED_ENUM(OmpAccessGroup, Value)
|
||||
WALK_NESTED_ENUM(OmpDeviceModifier, Value) // OMP device modifier
|
||||
WALK_NESTED_ENUM(
|
||||
OmpDeviceTypeClause, DeviceTypeDescription) // OMP device_type
|
||||
|
||||
@ -2581,6 +2581,7 @@ CHECK_SIMPLE_CLAUSE(Default, OMPC_default)
|
||||
CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj)
|
||||
CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type)
|
||||
CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule)
|
||||
CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate)
|
||||
CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive)
|
||||
CHECK_SIMPLE_CLAUSE(Final, OMPC_final)
|
||||
CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush)
|
||||
|
||||
10
flang/test/Lower/OpenMP/Todo/dyn-groupprivate-clause.f90
Normal file
10
flang/test/Lower/OpenMP/Todo/dyn-groupprivate-clause.f90
Normal file
@ -0,0 +1,10 @@
|
||||
!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=61 -o - %s 2>&1 | FileCheck %s
|
||||
|
||||
!CHECK: not yet implemented: DYN_GROUPPRIVATE clause is not implemented yet
|
||||
subroutine f00(n)
|
||||
implicit none
|
||||
integer :: n
|
||||
!$omp target dyn_groupprivate(n)
|
||||
!$omp end target
|
||||
end
|
||||
|
||||
70
flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90
Normal file
70
flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90
Normal file
@ -0,0 +1,70 @@
|
||||
!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=61 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s
|
||||
!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=61 %s | FileCheck --check-prefix="PARSE-TREE" %s
|
||||
|
||||
subroutine f00(n)
|
||||
implicit none
|
||||
integer :: n
|
||||
!$omp target dyn_groupprivate(n)
|
||||
!$omp end target
|
||||
end
|
||||
|
||||
!UNPARSE: SUBROUTINE f00 (n)
|
||||
!UNPARSE: IMPLICIT NONE
|
||||
!UNPARSE: INTEGER n
|
||||
!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(n)
|
||||
!UNPARSE: !$OMP END TARGET
|
||||
!UNPARSE: END SUBROUTINE
|
||||
|
||||
!PARSE-TREE: OmpBeginDirective
|
||||
!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target
|
||||
!PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause
|
||||
!PARSE-TREE: | | Scalar -> Integer -> Expr = 'n'
|
||||
!PARSE-TREE: | | | Designator -> DataRef -> Name = 'n'
|
||||
!PARSE-TREE: | Flags = None
|
||||
|
||||
|
||||
subroutine f01(n)
|
||||
implicit none
|
||||
integer :: n
|
||||
!$omp target dyn_groupprivate(strict: n)
|
||||
!$omp end target
|
||||
end
|
||||
|
||||
!UNPARSE: SUBROUTINE f01 (n)
|
||||
!UNPARSE: IMPLICIT NONE
|
||||
!UNPARSE: INTEGER n
|
||||
!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(STRICT: n)
|
||||
!UNPARSE: !$OMP END TARGET
|
||||
!UNPARSE: END SUBROUTINE
|
||||
|
||||
!PARSE-TREE: OmpBeginDirective
|
||||
!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target
|
||||
!PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause
|
||||
!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Strict
|
||||
!PARSE-TREE: | | Scalar -> Integer -> Expr = 'n'
|
||||
!PARSE-TREE: | | | Designator -> DataRef -> Name = 'n'
|
||||
!PARSE-TREE: | Flags = None
|
||||
|
||||
|
||||
subroutine f02(n)
|
||||
implicit none
|
||||
integer :: n
|
||||
!$omp target dyn_groupprivate(fallback, cgroup: n)
|
||||
!$omp end target
|
||||
end
|
||||
|
||||
!UNPARSE: SUBROUTINE f02 (n)
|
||||
!UNPARSE: IMPLICIT NONE
|
||||
!UNPARSE: INTEGER n
|
||||
!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK, CGROUP: n)
|
||||
!UNPARSE: !$OMP END TARGET
|
||||
!UNPARSE: END SUBROUTINE
|
||||
|
||||
!PARSE-TREE: OmpBeginDirective
|
||||
!PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target
|
||||
!PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause
|
||||
!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Fallback
|
||||
!PARSE-TREE: | | Modifier -> OmpAccessGroup -> Value = Cgroup
|
||||
!PARSE-TREE: | | Scalar -> Integer -> Expr = 'n'
|
||||
!PARSE-TREE: | | | Designator -> DataRef -> Name = 'n'
|
||||
!PARSE-TREE: | Flags = None
|
||||
@ -1464,8 +1464,8 @@ class ChildVisitingFormatter(BasicFormatter):
|
||||
return output.getvalue()
|
||||
|
||||
|
||||
class RecursiveDecentFormatter(BasicFormatter):
|
||||
"""The recursive decent formatter prints the value and the decendents.
|
||||
class RecursiveDescentFormatter(BasicFormatter):
|
||||
"""The recursive descent formatter prints the value and the descendants.
|
||||
|
||||
The constructor takes two keyword args: indent_level, which defaults to 0,
|
||||
and indent_child, which defaults to 2. The current indentation level is
|
||||
@ -1482,7 +1482,6 @@ class RecursiveDecentFormatter(BasicFormatter):
|
||||
output = io.StringIO()
|
||||
else:
|
||||
output = buffer
|
||||
|
||||
BasicFormatter.format(self, value, buffer=output, indent=self.lindent)
|
||||
new_indent = self.lindent + self.cindent
|
||||
for child in value:
|
||||
@ -1490,7 +1489,7 @@ class RecursiveDecentFormatter(BasicFormatter):
|
||||
BasicFormatter.format(self, child, buffer=output, indent=new_indent)
|
||||
else:
|
||||
if child.GetNumChildren() > 0:
|
||||
rdf = RecursiveDecentFormatter(indent_level=new_indent)
|
||||
rdf = RecursiveDescentFormatter(indent_level=new_indent)
|
||||
rdf.format(child, buffer=output)
|
||||
else:
|
||||
BasicFormatter.format(self, child, buffer=output, indent=new_indent)
|
||||
|
||||
@ -38,7 +38,7 @@ class PlatformProcessCrashInfoTestCase(TestBase):
|
||||
patterns=[
|
||||
"Extended Crash Information",
|
||||
"Crash-Info Annotations",
|
||||
"pointer being freed was not allocated",
|
||||
"BUG IN CLIENT OF LIBMALLOC",
|
||||
],
|
||||
)
|
||||
|
||||
@ -67,7 +67,7 @@ class PlatformProcessCrashInfoTestCase(TestBase):
|
||||
|
||||
self.assertTrue(crash_info.IsValid())
|
||||
|
||||
self.assertIn("pointer being freed was not allocated", stream.GetData())
|
||||
self.assertIn("BUG IN CLIENT OF LIBMALLOC", stream.GetData())
|
||||
|
||||
# dyld leaves permanent crash_info records when testing on device.
|
||||
@skipIfDarwinEmbedded
|
||||
|
||||
@ -83,7 +83,7 @@ class ValueAPITestCase(TestBase):
|
||||
|
||||
fmt = lldbutil.BasicFormatter()
|
||||
cvf = lldbutil.ChildVisitingFormatter(indent_child=2)
|
||||
rdf = lldbutil.RecursiveDecentFormatter(indent_child=2)
|
||||
rdf = lldbutil.RecursiveDescentFormatter(indent_child=2)
|
||||
if self.TraceOn():
|
||||
print(fmt.format(days_of_week))
|
||||
print(cvf.format(days_of_week))
|
||||
|
||||
@ -144,7 +144,7 @@ public:
|
||||
template <typename P>
|
||||
void
|
||||
RunOnce(const std::function<void(llvm::Expected<P>)> &callback,
|
||||
std::chrono::milliseconds timeout = std::chrono::milliseconds(100)) {
|
||||
std::chrono::milliseconds timeout = std::chrono::milliseconds(200)) {
|
||||
auto handle = m_transport_up->RegisterReadObject<P>(
|
||||
loop, [&](lldb_private::MainLoopBase &loop, llvm::Expected<P> message) {
|
||||
callback(std::move(message));
|
||||
|
||||
@ -1040,8 +1040,8 @@ class ChildVisitingFormatter(BasicFormatter):
|
||||
return output.getvalue()
|
||||
|
||||
|
||||
class RecursiveDecentFormatter(BasicFormatter):
|
||||
"""The recursive decent formatter prints the value and the decendents.
|
||||
class RecursiveDescentFormatter(BasicFormatter):
|
||||
"""The recursive descent formatter prints the value and the descendants.
|
||||
|
||||
The constructor takes two keyword args: indent_level, which defaults to 0,
|
||||
and indent_child, which defaults to 2. The current indentation level is
|
||||
@ -1058,7 +1058,6 @@ class RecursiveDecentFormatter(BasicFormatter):
|
||||
output = io.StringIO()
|
||||
else:
|
||||
output = buffer
|
||||
|
||||
BasicFormatter.format(self, value, buffer=output, indent=self.lindent)
|
||||
new_indent = self.lindent + self.cindent
|
||||
for child in value:
|
||||
@ -1066,7 +1065,7 @@ class RecursiveDecentFormatter(BasicFormatter):
|
||||
BasicFormatter.format(self, child, buffer=output, indent=new_indent)
|
||||
else:
|
||||
if child.GetNumChildren() > 0:
|
||||
rdf = RecursiveDecentFormatter(indent_level=new_indent)
|
||||
rdf = RecursiveDescentFormatter(indent_level=new_indent)
|
||||
rdf.format(child, buffer=output)
|
||||
else:
|
||||
BasicFormatter.format(self, child, buffer=output, indent=new_indent)
|
||||
|
||||
@ -891,7 +891,8 @@ option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON)
|
||||
option (LLVM_ENABLE_BINDINGS "Build bindings." ON)
|
||||
option (LLVM_ENABLE_TELEMETRY "Enable the telemetry library. If set to OFF, library cannot be enabled after build (eg., at runtime)" ON)
|
||||
|
||||
if(UNIX AND CMAKE_SIZEOF_VOID_P GREATER_EQUAL 8)
|
||||
# Default to building OnDiskCAS on 64-bit systems.
|
||||
if(CMAKE_SIZEOF_VOID_P GREATER_EQUAL 8)
|
||||
set(LLVM_ENABLE_ONDISK_CAS_default ON)
|
||||
else()
|
||||
set(LLVM_ENABLE_ONDISK_CAS_default OFF)
|
||||
|
||||
@ -97,6 +97,13 @@ public:
|
||||
MappedFileRegionBumpPtr &operator=(const MappedFileRegionBumpPtr &) = delete;
|
||||
|
||||
private:
|
||||
// The file size increment to extend the storage size.
|
||||
// The minimum increment is a page, but allocate more to amortize the cost.
|
||||
static constexpr int64_t Increment = 4 * 1024 * 1024; // 4 MB
|
||||
|
||||
// Extend the AllocatedSize to be enough to hold NewEnd.
|
||||
Error extendSpaceImpl(int64_t NewEnd);
|
||||
|
||||
void destroyImpl();
|
||||
void moveImpl(MappedFileRegionBumpPtr &RHS) {
|
||||
std::swap(Region, RHS.Region);
|
||||
@ -114,7 +121,9 @@ private:
|
||||
RegionT Region;
|
||||
Header *H = nullptr;
|
||||
std::string Path;
|
||||
// File descriptor for the main storage file.
|
||||
std::optional<int> FD;
|
||||
// File descriptor for the file used as reader/writer lock.
|
||||
std::optional<int> SharedLockFD;
|
||||
};
|
||||
|
||||
|
||||
@ -242,7 +242,7 @@ ENUM(MotionExpectation, Present);
|
||||
// V5.2: [15.9.1] `task-dependence-type` modifier
|
||||
ENUM(DependenceType, Depobj, In, Inout, Inoutset, Mutexinoutset, Out, Sink,
|
||||
Source);
|
||||
ENUM(Prescriptiveness, Strict);
|
||||
ENUM(Prescriptiveness, Strict, Fallback);
|
||||
|
||||
template <typename I, typename E> //
|
||||
struct LoopIterationT {
|
||||
@ -574,6 +574,15 @@ struct DynamicAllocatorsT {
|
||||
using EmptyTrait = std::true_type;
|
||||
};
|
||||
|
||||
template <typename T, typename I, typename E> //
|
||||
struct DynGroupprivateT {
|
||||
ENUM(AccessGroup, Cgroup);
|
||||
using Prescriptiveness = type::Prescriptiveness;
|
||||
using Size = E;
|
||||
using TupleTrait = std::true_type;
|
||||
std::tuple<OPT(AccessGroup), OPT(Prescriptiveness), Size> t;
|
||||
};
|
||||
|
||||
// V5.2: [5.8.4] `enter` clause
|
||||
template <typename T, typename I, typename E> //
|
||||
struct EnterT {
|
||||
@ -1263,11 +1272,12 @@ template <typename T, typename I, typename E>
|
||||
using TupleClausesT =
|
||||
std::variant<AffinityT<T, I, E>, AlignedT<T, I, E>, AllocateT<T, I, E>,
|
||||
DefaultmapT<T, I, E>, DeviceT<T, I, E>, DistScheduleT<T, I, E>,
|
||||
DoacrossT<T, I, E>, FromT<T, I, E>, GrainsizeT<T, I, E>,
|
||||
IfT<T, I, E>, InitT<T, I, E>, InReductionT<T, I, E>,
|
||||
LastprivateT<T, I, E>, LinearT<T, I, E>, MapT<T, I, E>,
|
||||
NumTasksT<T, I, E>, OrderT<T, I, E>, ReductionT<T, I, E>,
|
||||
ScheduleT<T, I, E>, TaskReductionT<T, I, E>, ToT<T, I, E>>;
|
||||
DoacrossT<T, I, E>, DynGroupprivateT<T, I, E>, FromT<T, I, E>,
|
||||
GrainsizeT<T, I, E>, IfT<T, I, E>, InitT<T, I, E>,
|
||||
InReductionT<T, I, E>, LastprivateT<T, I, E>, LinearT<T, I, E>,
|
||||
MapT<T, I, E>, NumTasksT<T, I, E>, OrderT<T, I, E>,
|
||||
ReductionT<T, I, E>, ScheduleT<T, I, E>,
|
||||
TaskReductionT<T, I, E>, ToT<T, I, E>>;
|
||||
|
||||
template <typename T, typename I, typename E>
|
||||
using UnionClausesT = std::variant<DependT<T, I, E>>;
|
||||
|
||||
@ -178,6 +178,9 @@ def OMPC_Doacross : Clause<[Spelling<"doacross">]> {
|
||||
def OMPC_DynamicAllocators : Clause<[Spelling<"dynamic_allocators">]> {
|
||||
let clangClass = "OMPDynamicAllocatorsClause";
|
||||
}
|
||||
def OMPC_DynGroupprivate : Clause<[Spelling<"dyn_groupprivate">]> {
|
||||
let flangClass = "OmpDynGroupprivateClause";
|
||||
}
|
||||
def OMPC_Enter : Clause<[Spelling<"enter">]> {
|
||||
let flangClass = "OmpEnterClause";
|
||||
}
|
||||
@ -1104,6 +1107,7 @@ def OMP_Target : Directive<[Spelling<"target">]> {
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_Device>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_If>,
|
||||
VersionedClause<OMPC_NoWait>,
|
||||
VersionedClause<OMPC_OMPX_Bare>,
|
||||
@ -1254,6 +1258,7 @@ def OMP_Teams : Directive<[Spelling<"teams">]> {
|
||||
];
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_Default>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_If, 52>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_ThreadLimit>,
|
||||
@ -1522,6 +1527,7 @@ def OMP_target_loop : Directive<[Spelling<"target loop">]> {
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_Bind, 50>,
|
||||
VersionedClause<OMPC_Collapse>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_Order>,
|
||||
VersionedClause<OMPC_ThreadLimit>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
@ -1983,6 +1989,7 @@ def OMP_TargetParallel : Directive<[Spelling<"target parallel">]> {
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_Device>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NumThreads>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
VersionedClause<OMPC_ProcBind>,
|
||||
@ -2012,6 +2019,7 @@ def OMP_TargetParallelDo : Directive<[Spelling<"target parallel do">]> {
|
||||
VersionedClause<OMPC_Collapse>,
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_Device>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NoWait>,
|
||||
VersionedClause<OMPC_NumThreads>,
|
||||
VersionedClause<OMPC_Order, 50>,
|
||||
@ -2054,6 +2062,9 @@ def OMP_TargetParallelDoSimd
|
||||
VersionedClause<OMPC_SimdLen>,
|
||||
VersionedClause<OMPC_UsesAllocators>,
|
||||
];
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
];
|
||||
let leafConstructs = [OMP_Target, OMP_Parallel, OMP_Do, OMP_Simd];
|
||||
let category = CA_Executable;
|
||||
let languages = [L_Fortran];
|
||||
@ -2086,6 +2097,7 @@ def OMP_TargetParallelFor : Directive<[Spelling<"target parallel for">]> {
|
||||
VersionedClause<OMPC_UsesAllocators, 50>,
|
||||
];
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
VersionedClause<OMPC_ThreadLimit, 51>,
|
||||
];
|
||||
@ -2126,6 +2138,7 @@ def OMP_TargetParallelForSimd
|
||||
VersionedClause<OMPC_UsesAllocators, 50>,
|
||||
];
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
VersionedClause<OMPC_ThreadLimit, 51>,
|
||||
];
|
||||
@ -2155,6 +2168,7 @@ def OMP_target_parallel_loop : Directive<[Spelling<"target parallel loop">]> {
|
||||
VersionedClause<OMPC_Collapse>,
|
||||
VersionedClause<OMPC_Default>,
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NoWait>,
|
||||
VersionedClause<OMPC_NumThreads>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
@ -2189,6 +2203,7 @@ def OMP_TargetSimd : Directive<[Spelling<"target simd">]> {
|
||||
VersionedClause<OMPC_Collapse>,
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_Device>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NumThreads>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
VersionedClause<OMPC_Order, 50>,
|
||||
@ -2220,6 +2235,7 @@ def OMP_TargetTeams : Directive<[Spelling<"target teams">]> {
|
||||
VersionedClause<OMPC_Default>,
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_Device>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NoWait>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
@ -2252,6 +2268,7 @@ def OMP_TargetTeamsDistribute
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_Device>,
|
||||
VersionedClause<OMPC_DistSchedule>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NoWait>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
@ -2284,6 +2301,7 @@ def OMP_TargetTeamsDistributeParallelDo
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_Device>,
|
||||
VersionedClause<OMPC_DistSchedule>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NoWait>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_NumThreads>,
|
||||
@ -2322,6 +2340,7 @@ def OMP_TargetTeamsDistributeParallelDoSimd
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_Device>,
|
||||
VersionedClause<OMPC_DistSchedule>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NoWait>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_NumThreads>,
|
||||
@ -2367,6 +2386,7 @@ def OMP_TargetTeamsDistributeParallelFor
|
||||
VersionedClause<OMPC_UsesAllocators, 50>,
|
||||
];
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
];
|
||||
let leafConstructs =
|
||||
@ -2409,6 +2429,7 @@ def OMP_TargetTeamsDistributeParallelForSimd
|
||||
VersionedClause<OMPC_UsesAllocators, 50>,
|
||||
];
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
];
|
||||
let leafConstructs =
|
||||
@ -2441,6 +2462,7 @@ def OMP_TargetTeamsDistributeSimd
|
||||
VersionedClause<OMPC_DefaultMap>,
|
||||
VersionedClause<OMPC_Device>,
|
||||
VersionedClause<OMPC_DistSchedule>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NoWait>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
@ -2474,6 +2496,7 @@ def OMP_target_teams_loop : Directive<[Spelling<"target teams loop">]> {
|
||||
VersionedClause<OMPC_Bind, 50>,
|
||||
VersionedClause<OMPC_Collapse>,
|
||||
VersionedClause<OMPC_Default>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NoWait>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_OMPX_DynCGroupMem>,
|
||||
@ -2532,6 +2555,7 @@ def OMP_TeamsDistribute : Directive<[Spelling<"teams distribute">]> {
|
||||
VersionedClause<OMPC_ThreadLimit>,
|
||||
];
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_If>,
|
||||
VersionedClause<OMPC_Order, 50>,
|
||||
];
|
||||
@ -2555,6 +2579,7 @@ def OMP_TeamsDistributeParallelDo
|
||||
VersionedClause<OMPC_Collapse>,
|
||||
VersionedClause<OMPC_Default>,
|
||||
VersionedClause<OMPC_DistSchedule>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_NumThreads>,
|
||||
VersionedClause<OMPC_Order, 50>,
|
||||
@ -2584,6 +2609,7 @@ def OMP_TeamsDistributeParallelDoSimd
|
||||
VersionedClause<OMPC_Collapse>,
|
||||
VersionedClause<OMPC_Default>,
|
||||
VersionedClause<OMPC_DistSchedule>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_NumThreads>,
|
||||
VersionedClause<OMPC_Order, 50>,
|
||||
@ -2620,6 +2646,9 @@ def OMP_TeamsDistributeParallelFor
|
||||
VersionedClause<OMPC_Shared>,
|
||||
VersionedClause<OMPC_ThreadLimit>,
|
||||
];
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
];
|
||||
let leafConstructs = [OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For];
|
||||
let category = CA_Executable;
|
||||
let languages = [L_C];
|
||||
@ -2650,6 +2679,9 @@ def OMP_TeamsDistributeParallelForSimd
|
||||
VersionedClause<OMPC_SimdLen>,
|
||||
VersionedClause<OMPC_ThreadLimit>,
|
||||
];
|
||||
let allowedOnceClauses = [
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
];
|
||||
let leafConstructs =
|
||||
[OMP_Teams, OMP_Distribute, OMP_Parallel, OMP_For, OMP_Simd];
|
||||
let category = CA_Executable;
|
||||
@ -2673,6 +2705,7 @@ def OMP_TeamsDistributeSimd : Directive<[Spelling<"teams distribute simd">]> {
|
||||
VersionedClause<OMPC_Collapse>,
|
||||
VersionedClause<OMPC_Default>,
|
||||
VersionedClause<OMPC_DistSchedule>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_Order, 50>,
|
||||
VersionedClause<OMPC_SafeLen>,
|
||||
@ -2696,6 +2729,7 @@ def OMP_teams_loop : Directive<[Spelling<"teams loop">]> {
|
||||
VersionedClause<OMPC_Bind, 50>,
|
||||
VersionedClause<OMPC_Collapse>,
|
||||
VersionedClause<OMPC_Default>,
|
||||
VersionedClause<OMPC_DynGroupprivate, 61>,
|
||||
VersionedClause<OMPC_NumTeams>,
|
||||
VersionedClause<OMPC_Order>,
|
||||
VersionedClause<OMPC_ThreadLimit>,
|
||||
|
||||
@ -493,6 +493,9 @@ private:
|
||||
/// and we only need to check individual instructions.
|
||||
bool canVectorizeInstrs();
|
||||
|
||||
/// Check if an individual instruction is vectorizable.
|
||||
bool canVectorizeInstr(Instruction &I);
|
||||
|
||||
/// When we vectorize loops we may change the order in which
|
||||
/// we read and write from memory. This method checks if it is
|
||||
/// legal to vectorize the code, considering only memory constraints.
|
||||
|
||||
@ -76,6 +76,26 @@ void DemandedBits::determineLiveOperandBits(
|
||||
computeKnownBits(V2, Known2, DL, &AC, UserI, &DT);
|
||||
}
|
||||
};
|
||||
// Sets AB to the union of AOut shifted by every amount in [Min, Max]
// (shl when ShiftLeft is true, lshr otherwise). Rather than performing one
// shift per amount, the union over [0, Max - Min] is assembled by repeated
// doubling and then moved into place with a single final shift by Min.
// Captures AB, AOut and BitWidth from the enclosing scope by reference.
auto GetShiftedRange = [&](uint64_t Min, uint64_t Max, bool ShiftLeft) {
|
||||
// Shifts Mask by ShiftAmnt in the direction selected by ShiftLeft.
auto ShiftF = [ShiftLeft](const APInt &Mask, unsigned ShiftAmnt) {
|
||||
return ShiftLeft ? Mask.shl(ShiftAmnt) : Mask.lshr(ShiftAmnt);
|
||||
};
|
||||
AB = APInt::getZero(BitWidth);
|
||||
// Number of shift amounts still to be covered beyond a shift of zero.
uint64_t LoopRange = Max - Min;
|
||||
// Mask accumulates the result; it starts with the shift-by-zero term.
APInt Mask = AOut;
|
||||
APInt Shifted = AOut; // AOut | (AOut << 1) | ... | (AOut << (ShiftAmnt - 1))
|
||||
// ShiftAmnt walks the powers of two. At the top of each iteration Shifted
// holds AOut shifted by every amount in [0, ShiftAmnt).
for (unsigned ShiftAmnt = 1; ShiftAmnt <= LoopRange; ShiftAmnt <<= 1) {
|
||||
if (LoopRange & ShiftAmnt) {
|
||||
// Account for (LoopRange - ShiftAmnt, LoopRange]
|
||||
Mask |= ShiftF(Shifted, LoopRange - ShiftAmnt + 1);
|
||||
// Clears the lowest set bit of LoopRange, which is the bit just handled.
|
||||
LoopRange -= ShiftAmnt;
|
||||
}
|
||||
// [0, ShiftAmnt) -> [0, ShiftAmnt * 2)
|
||||
Shifted |= ShiftF(Shifted, ShiftAmnt);
|
||||
}
|
||||
// Mask covers shifts in [0, Max - Min]; shift once more by Min so that the
// result covers [Min, Max].
AB = ShiftF(Mask, Min);
|
||||
};
|
||||
|
||||
switch (UserI->getOpcode()) {
|
||||
default: break;
|
||||
@ -183,6 +203,17 @@ void DemandedBits::determineLiveOperandBits(
|
||||
AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
|
||||
else if (S->hasNoUnsignedWrap())
|
||||
AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
|
||||
} else {
|
||||
ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr);
|
||||
uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1);
|
||||
uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1);
|
||||
// similar to Lshr case
|
||||
GetShiftedRange(Min, Max, /*ShiftLeft=*/false);
|
||||
const auto *S = cast<ShlOperator>(UserI);
|
||||
if (S->hasNoSignedWrap())
|
||||
AB |= APInt::getHighBitsSet(BitWidth, Max + 1);
|
||||
else if (S->hasNoUnsignedWrap())
|
||||
AB |= APInt::getHighBitsSet(BitWidth, Max);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -197,6 +228,24 @@ void DemandedBits::determineLiveOperandBits(
|
||||
// (they must be zero).
|
||||
if (cast<LShrOperator>(UserI)->isExact())
|
||||
AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
|
||||
} else {
|
||||
ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr);
|
||||
uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1);
|
||||
uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1);
|
||||
// Suppose AOut == 0b0000 0001
|
||||
// [min, max] = [1, 3], so LoopRange = max - min = 2
|
||||
// iteration 1 (ShiftAmnt = 1): Shifted is 0b0000 0011, Mask is unchanged
|
||||
// iteration 2 (ShiftAmnt = 2): Mask is 0b0000 0111, Shifted is 0b0000 1111
|
||||
// iteration 3: LoopRange has been reduced to 0, so we stop.
|
||||
//
|
||||
// After the iterations we need one more shift by min,
|
||||
// to move from 0b0000 0111 to --> 0b0000 1110.
|
||||
// The loop populates the mask relative to (0,...,max-min),
|
||||
// but we need coverage from (min, max).
|
||||
// This is why the shift by min is needed.
|
||||
GetShiftedRange(Min, Max, /*ShiftLeft=*/true);
|
||||
if (cast<LShrOperator>(UserI)->isExact())
|
||||
AB |= APInt::getLowBitsSet(BitWidth, Max);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -217,6 +266,26 @@ void DemandedBits::determineLiveOperandBits(
|
||||
// (they must be zero).
|
||||
if (cast<AShrOperator>(UserI)->isExact())
|
||||
AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
|
||||
} else {
|
||||
ComputeKnownBits(BitWidth, UserI->getOperand(1), nullptr);
|
||||
uint64_t Min = Known.getMinValue().getLimitedValue(BitWidth - 1);
|
||||
uint64_t Max = Known.getMaxValue().getLimitedValue(BitWidth - 1);
|
||||
GetShiftedRange(Min, Max, /*ShiftLeft=*/true);
|
||||
if (Max &&
|
||||
(AOut & APInt::getHighBitsSet(BitWidth, Max)).getBoolValue()) {
|
||||
// Suppose AOut = 0011 1100
|
||||
// [min, max] = [1, 3]
|
||||
// ShiftAmount = 1 : Mask is 1000 0000
|
||||
// ShiftAmount = 2 : Mask is 1100 0000
|
||||
// ShiftAmount = 3 : Mask is 1110 0000
|
||||
// The Mask with Max covers every case in [min, max],
|
||||
// so we are done
|
||||
AB.setSignBit();
|
||||
}
|
||||
// If the shift is exact, then the low bits are not dead
|
||||
// (they must be zero).
|
||||
if (cast<AShrOperator>(UserI)->isExact())
|
||||
AB |= APInt::getLowBitsSet(BitWidth, Max);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@ -180,6 +180,10 @@ static cl::opt<bool> DisableGEPConstOperand(
|
||||
"disable-gep-const-evaluation", cl::Hidden, cl::init(false),
|
||||
cl::desc("Disables evaluation of GetElementPtr with constant operands"));
|
||||
|
||||
// Hidden command-line flag, off by default. When it is set, getInlineCost
// returns InlineCost::getAlways for any callee that isInlineViable accepts,
// before doing any further analysis of the call.
static cl::opt<bool> InlineAllViableCalls(
|
||||
"inline-all-viable-calls", cl::Hidden, cl::init(false),
|
||||
cl::desc("Inline all viable calls, even if they exceed the inlining "
|
||||
"threshold"));
|
||||
namespace llvm {
|
||||
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
|
||||
if (Attr.isValid()) {
|
||||
@ -3272,6 +3276,10 @@ InlineCost llvm::getInlineCost(
|
||||
return llvm::InlineCost::getNever(UserDecision->getFailureReason());
|
||||
}
|
||||
|
||||
if (InlineAllViableCalls && isInlineViable(*Callee).isSuccess())
|
||||
return llvm::InlineCost::getAlways(
|
||||
"Inlining forced by -inline-all-viable-calls");
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
|
||||
<< "... (caller:" << Call.getCaller()->getName()
|
||||
<< ")\n");
|
||||
|
||||
@ -4002,7 +4002,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
|
||||
case Intrinsic::amdgcn_rcp_legacy:
|
||||
case Intrinsic::amdgcn_rsq_legacy:
|
||||
case Intrinsic::amdgcn_rsq_clamp:
|
||||
case Intrinsic::amdgcn_tanh: {
|
||||
case Intrinsic::amdgcn_tanh:
|
||||
case Intrinsic::amdgcn_prng_b32: {
|
||||
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
|
||||
SDValue Src = N->getOperand(1);
|
||||
return Src.isUndef() ? Src : SDValue();
|
||||
|
||||
@ -2366,8 +2366,12 @@ static bool containsBufferFatPointers(const Function &F,
|
||||
BufferFatPtrToStructTypeMap *TypeMap) {
|
||||
bool HasFatPointers = false;
|
||||
for (const BasicBlock &BB : F)
|
||||
for (const Instruction &I : BB)
|
||||
for (const Instruction &I : BB) {
|
||||
HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
|
||||
// Catch null pointer constants in loads, stores, etc.
|
||||
for (const Value *V : I.operand_values())
|
||||
HasFatPointers |= (V->getType() != TypeMap->remapType(V->getType()));
|
||||
}
|
||||
return HasFatPointers;
|
||||
}
|
||||
|
||||
|
||||
@ -389,6 +389,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
|
||||
Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) &&
|
||||
// Matrix B format operand reuses op_sel_hi.
|
||||
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_fmt) &&
|
||||
// Matrix B scale operand reuses op_sel_hi.
|
||||
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_scale) &&
|
||||
// Matrix B reuse operand reuses op_sel_hi.
|
||||
!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) {
|
||||
Encoding |= getImplicitOpSelHiEncoding(Opcode);
|
||||
|
||||
@ -3056,8 +3056,6 @@ def : GCNPat<
|
||||
}
|
||||
} // AddedComplexity = 1
|
||||
|
||||
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
|
||||
let True16Predicate = p in {
|
||||
def : GCNPat<
|
||||
(i32 (DivergentUnaryFrag<zext> i16:$src)),
|
||||
(V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
|
||||
@ -3073,26 +3071,6 @@ def : GCNPat<
|
||||
def : GCNPat<
|
||||
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
|
||||
(COPY VSrc_b16:$src)>;
|
||||
}
|
||||
|
||||
let True16Predicate = UseRealTrue16Insts in {
|
||||
def : GCNPat<
|
||||
(i32 (DivergentUnaryFrag<zext> i16:$src)),
|
||||
(REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
|
||||
>;
|
||||
|
||||
def : GCNPat<
|
||||
(i64 (DivergentUnaryFrag<zext> i16:$src)),
|
||||
(REG_SEQUENCE VReg_64,
|
||||
(REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16), sub0,
|
||||
(S_MOV_B32 (i32 0)), sub1)
|
||||
>;
|
||||
|
||||
def : GCNPat<
|
||||
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
|
||||
(REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
|
||||
>;
|
||||
}
|
||||
|
||||
def : GCNPat <
|
||||
(i32 (trunc i64:$a)),
|
||||
|
||||
@ -736,7 +736,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
|
||||
}
|
||||
case TargetOpcode::G_FCONSTANT: {
|
||||
// TODO: Use constant pool for complex constants.
|
||||
// TODO: Optimize +0.0 to use fcvt.d.w for s64 on rv32.
|
||||
Register DstReg = MI.getOperand(0).getReg();
|
||||
const APFloat &FPimm = MI.getOperand(1).getFPImm()->getValueAPF();
|
||||
APInt Imm = FPimm.bitcastToAPInt();
|
||||
@ -753,8 +752,22 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
|
||||
if (!FMV.constrainAllUses(TII, TRI, RBI))
|
||||
return false;
|
||||
} else {
|
||||
// s64 on rv32
|
||||
assert(Size == 64 && !Subtarget->is64Bit() &&
|
||||
"Unexpected size or subtarget");
|
||||
|
||||
if (Imm.isNonNegative() && Imm.isZero()) {
|
||||
// Optimize +0.0 to use fcvt.d.w
|
||||
MachineInstrBuilder FCVT =
|
||||
MIB.buildInstr(RISCV::FCVT_D_W, {DstReg}, {Register(RISCV::X0)})
|
||||
.addImm(RISCVFPRndMode::RNE);
|
||||
if (!FCVT.constrainAllUses(TII, TRI, RBI))
|
||||
return false;
|
||||
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Split into two pieces and build through the stack.
|
||||
Register GPRRegHigh = MRI->createVirtualRegister(&RISCV::GPRRegClass);
|
||||
Register GPRRegLow = MRI->createVirtualRegister(&RISCV::GPRRegClass);
|
||||
|
||||
@ -38,8 +38,15 @@ struct CapabilityEntry {
|
||||
Capability::Capability ReqCapability;
|
||||
};
|
||||
|
||||
// One row of the EnvironmentEntries table: records that the symbolic operand
// identified by (Category, Value) is allowed in AllowedEnvironment. An
// operand allowed in several environments has one row per environment.
struct EnvironmentEntry {
|
||||
// Operand category the value belongs to.
OperandCategory::OperandCategory Category;
|
||||
// 32-bit value of the operand within its category.
uint32_t Value;
|
||||
// An environment in which the operand may be used.
Environment::Environment AllowedEnvironment;
|
||||
};
|
||||
|
||||
using namespace OperandCategory;
|
||||
using namespace Extension;
|
||||
using namespace Environment;
|
||||
using namespace Capability;
|
||||
using namespace InstructionSet;
|
||||
#define GET_SymbolicOperands_DECL
|
||||
@ -48,6 +55,8 @@ using namespace InstructionSet;
|
||||
#define GET_ExtensionEntries_IMPL
|
||||
#define GET_CapabilityEntries_DECL
|
||||
#define GET_CapabilityEntries_IMPL
|
||||
#define GET_EnvironmentEntries_DECL
|
||||
#define GET_EnvironmentEntries_IMPL
|
||||
#define GET_ExtendedBuiltins_DECL
|
||||
#define GET_ExtendedBuiltins_IMPL
|
||||
#include "SPIRVGenTables.inc"
|
||||
@ -133,6 +142,23 @@ getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category,
|
||||
return Capabilities;
|
||||
}
|
||||
|
||||
/// Collects every environment in which the symbolic operand identified by
/// (Category, Value) is allowed.
///
/// The lookup yields the first matching table entry (or null when there is
/// none); further entries for the same operand follow it consecutively, so
/// the table is walked forward until the key changes or the table ends.
EnvironmentList getSymbolicOperandAllowedEnvironments(
    SPIRV::OperandCategory::OperandCategory Category, uint32_t Value) {
  EnvironmentList Result;

  const SPIRV::EnvironmentEntry *Entry =
      SPIRV::lookupEnvironmentByCategoryAndValue(Category, Value);
  if (!Entry)
    return Result;

  const auto End = ArrayRef(SPIRV::EnvironmentEntries).end();
  do {
    // Stop at the first entry that belongs to a different operand.
    if (Entry->Category != Category || Entry->Value != Value)
      break;
    Result.push_back(static_cast<SPIRV::Environment::Environment>(
        Entry->AllowedEnvironment));
  } while (++Entry != End);

  return Result;
}
|
||||
|
||||
CapabilityList
|
||||
getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension) {
|
||||
const SPIRV::ExtensionEntry *Entry =
|
||||
|
||||
@ -37,6 +37,11 @@ namespace Capability {
|
||||
#include "SPIRVGenTables.inc"
|
||||
} // namespace Capability
|
||||
|
||||
namespace Environment {
|
||||
#define GET_Environment_DECL
|
||||
#include "SPIRVGenTables.inc"
|
||||
} // namespace Environment
|
||||
|
||||
namespace SourceLanguage {
|
||||
#define GET_SourceLanguage_DECL
|
||||
#include "SPIRVGenTables.inc"
|
||||
@ -241,6 +246,7 @@ enum InstFlags {
|
||||
|
||||
using CapabilityList = SmallVector<SPIRV::Capability::Capability, 8>;
|
||||
using ExtensionList = SmallVector<SPIRV::Extension::Extension, 8>;
|
||||
using EnvironmentList = SmallVector<SPIRV::Environment::Environment, 8>;
|
||||
|
||||
std::string
|
||||
getSymbolicOperandMnemonic(SPIRV::OperandCategory::OperandCategory Category,
|
||||
@ -254,6 +260,8 @@ getSymbolicOperandMaxVersion(SPIRV::OperandCategory::OperandCategory Category,
|
||||
CapabilityList
|
||||
getSymbolicOperandCapabilities(SPIRV::OperandCategory::OperandCategory Category,
|
||||
uint32_t Value);
|
||||
EnvironmentList getSymbolicOperandAllowedEnvironments(
|
||||
SPIRV::OperandCategory::OperandCategory Category, uint32_t Value);
|
||||
CapabilityList
|
||||
getCapabilitiesEnabledByExtension(SPIRV::Extension::Extension Extension);
|
||||
ExtensionList
|
||||
|
||||
@ -12,7 +12,8 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "SPIRVCommandLine.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "MCTargetDesc/SPIRVBaseInfo.h"
|
||||
#include "llvm/TargetParser/Triple.h"
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
|
||||
@ -171,3 +172,23 @@ StringRef SPIRVExtensionsParser::checkExtensions(
|
||||
}
|
||||
return StringRef();
|
||||
}
|
||||
|
||||
std::set<SPIRV::Extension::Extension>
|
||||
SPIRVExtensionsParser::getValidExtensions(const Triple &TT) {
|
||||
std::set<SPIRV::Extension::Extension> R;
|
||||
SPIRV::Environment::Environment CurrentEnvironment =
|
||||
SPIRV::Environment::Environment::EnvOpenCL;
|
||||
if (TT.getOS() == Triple::Vulkan)
|
||||
CurrentEnvironment = SPIRV::Environment::Environment::EnvVulkan;
|
||||
|
||||
for (const auto &[ExtensionName, ExtensionEnum] : SPIRVExtensionMap) {
|
||||
EnvironmentList AllowedEnv = getSymbolicOperandAllowedEnvironments(
|
||||
SPIRV::OperandCategory::OperandCategory::ExtensionOperand,
|
||||
ExtensionEnum);
|
||||
|
||||
if (std::count(AllowedEnv.begin(), AllowedEnv.end(), CurrentEnvironment))
|
||||
R.insert(ExtensionEnum);
|
||||
}
|
||||
|
||||
return R;
|
||||
}
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
|
||||
namespace llvm {
|
||||
class StringRef;
|
||||
class Triple;
|
||||
|
||||
/// Command line parser for toggling SPIR-V extensions.
|
||||
struct SPIRVExtensionsParser
|
||||
@ -42,6 +43,11 @@ public:
|
||||
static StringRef
|
||||
checkExtensions(const std::vector<std::string> &ExtNames,
|
||||
std::set<SPIRV::Extension::Extension> &AllowedExtensions);
|
||||
|
||||
/// Returns the list of extensions that are valid for a particular
|
||||
/// target environment (i.e., OpenCL or Vulkan).
|
||||
static std::set<SPIRV::Extension::Extension>
|
||||
getValidExtensions(const Triple &TT);
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
||||
@ -166,7 +166,13 @@ void SPIRVSubtarget::initAvailableExtInstSets() {
|
||||
void SPIRVSubtarget::initAvailableExtensions(
|
||||
const std::set<SPIRV::Extension::Extension> &AllowedExtIds) {
|
||||
AvailableExtensions.clear();
|
||||
AvailableExtensions.insert_range(AllowedExtIds);
|
||||
const std::set<SPIRV::Extension::Extension> &ValidExtensions =
|
||||
SPIRVExtensionsParser::getValidExtensions(TargetTriple);
|
||||
|
||||
for (const auto &Ext : AllowedExtIds) {
|
||||
if (ValidExtensions.count(Ext))
|
||||
AvailableExtensions.insert(Ext);
|
||||
}
|
||||
|
||||
accountForAMDShaderTrinaryMinmax();
|
||||
}
|
||||
|
||||
@ -109,23 +109,59 @@ def CapabilityEntries : GenericTable {
|
||||
let PrimaryKeyName = "lookupCapabilityByCategoryAndValue";
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Lookup table for matching symbolic operands (category + 32-bit value) to
|
||||
// SPIR-V environments. If an operand is allowed in more than one environment,
|
||||
// there will be multiple consecutive entries present in the table.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Forward-declare classes used in EnvironmentEntry
|
||||
class Environment;
|
||||
|
||||
// A single (category, value) -> allowed environment record. Each template
// argument is copied into the field of the same meaning.
class EnvironmentEntry<OperandCategory category, bits<32> value,
|
||||
Environment allowedEnvironment> {
|
||||
OperandCategory Category = category;
|
||||
bits<32> Value = value;
|
||||
Environment AllowedEnvironment = allowedEnvironment;
|
||||
}
|
||||
|
||||
// Table built from every EnvironmentEntry record, with one column per listed
// field, keyed by (Category, Value).
def EnvironmentEntries : GenericTable {
|
||||
let FilterClass = "EnvironmentEntry";
|
||||
let Fields = ["Category", "Value", "AllowedEnvironment"];
|
||||
// Types of the Category and AllowedEnvironment columns.
string TypeOf_Category = "OperandCategory";
|
||||
string TypeOf_AllowedEnvironment = "Environment";
|
||||
let PrimaryKey = ["Category", "Value"];
|
||||
// Function for looking up a (the first) environment by category + value. Next
|
||||
// environment should be consecutive.
|
||||
let PrimaryKeyName = "lookupEnvironmentByCategoryAndValue";
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Multiclass used to define a SymbolicOperand and at the same time declare
|
||||
// required extension and capabilities.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Defines a SymbolicOperand record together with its requirement records:
//   - at most one ExtensionEntry (reqExtensions may hold zero or one item),
//   - one CapabilityEntry per item of reqCapabilities,
//   - one EnvironmentEntry per item of allowedEnvironments.
// A non-empty mnemonic is required.
multiclass SymbolicOperandWithRequirements<
    OperandCategory category, bits<32> value, string mnemonic,
    bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions,
    list<Capability> reqCapabilities, list<Environment> allowedEnvironments> {
  assert !ge(!size(mnemonic), 1), "No mnemonic/string representation provided "
                                  "for symbolic operand with value "#value;
  def : SymbolicOperand<category, value, mnemonic, minVersion, maxVersion>;

  assert !le(!size(reqExtensions), 1),
         "Too many required extensions for a symbolic/named operand: "#mnemonic;
  if !eq(!size(reqExtensions), 1) then {
    def : ExtensionEntry<category, value, reqExtensions[0]>;
  }

  foreach capability = reqCapabilities in {
    def : CapabilityEntry<category, value, capability>;
  }

  foreach environment = allowedEnvironments in {
    def : EnvironmentEntry<category, value, environment>;
  }
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
@ -175,6 +211,20 @@ def CooperativeMatrixOperandsOperand : OperandCategory;
|
||||
def SpecConstantOpOperandsOperand : OperandCategory;
|
||||
def MatrixMultiplyAccumulateOperandsOperand : OperandCategory;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Definition of the Environments
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Enum built from every record of class Environment; the enumerator values
// are taken from each record's Value field.
def Environment : GenericEnum, Operand<i32> {
|
||||
let FilterClass = "Environment";
|
||||
let ValueField = "Value";
|
||||
}
|
||||
|
||||
// An environment record carrying the numeric value of its enumerator.
class Environment<bits<32> value> { bits<32> Value = value; }
|
||||
|
||||
// The two environments defined here: OpenCL (0) and Vulkan (1).
def EnvOpenCL : Environment<0>;
|
||||
def EnvVulkan : Environment<1>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Multiclass used to define Extensions enum values and at the same time
|
||||
// SymbolicOperand entries.
|
||||
@ -192,135 +242,146 @@ class Extension<string name, bits<32> value> {
|
||||
bits<32> Value = value;
|
||||
}
|
||||
|
||||
// Defines an Extension record named NAME with the given value, plus the
// matching symbolic operand in the ExtensionOperand category. The operand has
// no version bounds (0, 0), no required extensions or capabilities, and is
// allowed in the environments listed in allowedEnvironments.
multiclass ExtensionOperand<bits<32> value,
                            list<Environment> allowedEnvironments> {
  def NAME : Extension<NAME, value>;
  defm : SymbolicOperandWithRequirements<ExtensionOperand, value, NAME, 0,
                                         0, [], [], allowedEnvironments>;
}
|
||||
|
||||
defm SPV_AMD_shader_explicit_vertex_parameter : ExtensionOperand<1>;
|
||||
defm SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2>;
|
||||
defm SPV_AMD_gcn_shader : ExtensionOperand<3>;
|
||||
defm SPV_KHR_shader_ballot : ExtensionOperand<4>;
|
||||
defm SPV_AMD_shader_ballot : ExtensionOperand<5>;
|
||||
defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6>;
|
||||
defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7>;
|
||||
defm SPV_KHR_subgroup_vote : ExtensionOperand<8>;
|
||||
defm SPV_KHR_16bit_storage : ExtensionOperand<9>;
|
||||
defm SPV_KHR_device_group : ExtensionOperand<10>;
|
||||
defm SPV_KHR_multiview : ExtensionOperand<11>;
|
||||
defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12>;
|
||||
defm SPV_NV_viewport_array2 : ExtensionOperand<13>;
|
||||
defm SPV_NV_stereo_view_rendering : ExtensionOperand<14>;
|
||||
defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15>;
|
||||
defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16>;
|
||||
defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17>;
|
||||
defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18>;
|
||||
defm SPV_KHR_variable_pointers : ExtensionOperand<19>;
|
||||
defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20>;
|
||||
defm SPV_KHR_post_depth_coverage : ExtensionOperand<21>;
|
||||
defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22>;
|
||||
defm SPV_EXT_shader_stencil_export : ExtensionOperand<23>;
|
||||
defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24>;
|
||||
defm SPV_AMD_shader_image_load_store_lod : ExtensionOperand<25>;
|
||||
defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26>;
|
||||
defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27>;
|
||||
defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28>;
|
||||
defm SPV_GOOGLE_decorate_string : ExtensionOperand<29>;
|
||||
defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30>;
|
||||
defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31>;
|
||||
defm SPV_EXT_descriptor_indexing : ExtensionOperand<32>;
|
||||
defm SPV_KHR_8bit_storage : ExtensionOperand<33>;
|
||||
defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34>;
|
||||
defm SPV_NV_ray_tracing : ExtensionOperand<35>;
|
||||
defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36>;
|
||||
defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37>;
|
||||
defm SPV_NV_mesh_shader : ExtensionOperand<38>;
|
||||
defm SPV_NV_shader_image_footprint : ExtensionOperand<39>;
|
||||
defm SPV_NV_shading_rate : ExtensionOperand<40>;
|
||||
defm SPV_INTEL_subgroups : ExtensionOperand<41>;
|
||||
defm SPV_INTEL_media_block_io : ExtensionOperand<42>;
|
||||
defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44>;
|
||||
defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45>;
|
||||
defm SPV_KHR_float_controls : ExtensionOperand<46>;
|
||||
defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47>;
|
||||
defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48>;
|
||||
defm SPV_NV_cooperative_matrix : ExtensionOperand<49>;
|
||||
defm SPV_INTEL_shader_integer_functions2 : ExtensionOperand<50>;
|
||||
defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51>;
|
||||
defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52>;
|
||||
defm SPV_NV_shader_sm_builtins : ExtensionOperand<53>;
|
||||
defm SPV_KHR_shader_clock : ExtensionOperand<54>;
|
||||
defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55>;
|
||||
defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56>;
|
||||
defm SPV_INTEL_fpga_reg : ExtensionOperand<57>;
|
||||
defm SPV_INTEL_blocking_pipes : ExtensionOperand<58>;
|
||||
defm SPV_GOOGLE_user_type : ExtensionOperand<59>;
|
||||
defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60>;
|
||||
defm SPV_INTEL_kernel_attributes : ExtensionOperand<61>;
|
||||
defm SPV_KHR_non_semantic_info : ExtensionOperand<62>;
|
||||
defm SPV_INTEL_io_pipes : ExtensionOperand<63>;
|
||||
defm SPV_KHR_ray_tracing : ExtensionOperand<64>;
|
||||
defm SPV_KHR_ray_query : ExtensionOperand<65>;
|
||||
defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66>;
|
||||
defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67>;
|
||||
defm SPV_EXT_shader_atomic_float_add : ExtensionOperand<68>;
|
||||
defm SPV_KHR_terminate_invocation : ExtensionOperand<69>;
|
||||
defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70>;
|
||||
defm SPV_EXT_shader_image_int64 : ExtensionOperand<71>;
|
||||
defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72>;
|
||||
defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73>;
|
||||
defm SPV_INTEL_loop_fuse : ExtensionOperand<74>;
|
||||
defm SPV_EXT_shader_atomic_float_min_max : ExtensionOperand<75>;
|
||||
defm SPV_KHR_workgroup_memory_explicit_layout : ExtensionOperand<76>;
|
||||
defm SPV_KHR_linkonce_odr : ExtensionOperand<77>;
|
||||
defm SPV_KHR_expect_assume : ExtensionOperand<78>;
|
||||
defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79>;
|
||||
defm SPV_NV_bindless_texture : ExtensionOperand<80>;
|
||||
defm SPV_INTEL_fpga_invocation_pipelining_attributes : ExtensionOperand<81>;
|
||||
defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82>;
|
||||
defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83>;
|
||||
defm SPV_KHR_integer_dot_product : ExtensionOperand<84>;
|
||||
defm SPV_EXT_shader_atomic_float16_add : ExtensionOperand<85>;
|
||||
defm SPV_INTEL_runtime_aligned : ExtensionOperand<86>;
|
||||
defm SPV_KHR_bit_instructions : ExtensionOperand<87>;
|
||||
defm SPV_NV_ray_tracing_motion_blur : ExtensionOperand<88>;
|
||||
defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89>;
|
||||
defm SPV_KHR_subgroup_rotate : ExtensionOperand<90>;
|
||||
defm SPV_INTEL_split_barrier : ExtensionOperand<91>;
|
||||
defm SPV_KHR_ray_cull_mask : ExtensionOperand<92>;
|
||||
defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93>;
|
||||
defm SPV_EXT_relaxed_printf_string_address_space : ExtensionOperand<94>;
|
||||
defm SPV_EXT_ycbcr_attachments : ExtensionOperand<95>;
|
||||
defm SPV_EXT_mesh_shader : ExtensionOperand<96>;
|
||||
defm SPV_ARM_core_builtins : ExtensionOperand<97>;
|
||||
defm SPV_EXT_opacity_micromap : ExtensionOperand<98>;
|
||||
defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99>;
|
||||
defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100>;
|
||||
defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101>;
|
||||
defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102>;
|
||||
defm SPV_INTEL_optnone : ExtensionOperand<103>;
|
||||
defm SPV_INTEL_function_pointers : ExtensionOperand<104>;
|
||||
defm SPV_INTEL_variable_length_array : ExtensionOperand<105>;
|
||||
defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>;
|
||||
defm SPV_INTEL_inline_assembly : ExtensionOperand<107>;
|
||||
defm SPV_INTEL_cache_controls : ExtensionOperand<108>;
|
||||
defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>;
|
||||
defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>;
|
||||
defm SPV_KHR_cooperative_matrix : ExtensionOperand<111>;
|
||||
defm SPV_EXT_arithmetic_fence : ExtensionOperand<112>;
|
||||
defm SPV_EXT_optnone : ExtensionOperand<113>;
|
||||
defm SPV_INTEL_joint_matrix : ExtensionOperand<114>;
|
||||
defm SPV_INTEL_float_controls2 : ExtensionOperand<115>;
|
||||
defm SPV_INTEL_bindless_images : ExtensionOperand<116>;
|
||||
defm SPV_INTEL_long_composites : ExtensionOperand<117>;
|
||||
defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118>;
|
||||
defm SPV_INTEL_fp_max_error : ExtensionOperand<119>;
|
||||
defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120>;
|
||||
defm SPV_INTEL_subgroup_matrix_multiply_accumulate : ExtensionOperand<121>;
|
||||
defm SPV_INTEL_2d_block_io : ExtensionOperand<122>;
|
||||
defm SPV_INTEL_int4 : ExtensionOperand<123>;
|
||||
defm SPV_KHR_float_controls2 : ExtensionOperand<124>;
|
||||
defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125>;
|
||||
defm SPV_AMD_shader_explicit_vertex_parameter
|
||||
: ExtensionOperand<1, [EnvVulkan]>;
|
||||
defm SPV_AMD_shader_trinary_minmax_extension : ExtensionOperand<2, [EnvVulkan]>;
|
||||
defm SPV_AMD_gcn_shader : ExtensionOperand<3, [EnvVulkan]>;
|
||||
defm SPV_KHR_shader_ballot : ExtensionOperand<4, [EnvVulkan]>;
|
||||
defm SPV_AMD_shader_ballot : ExtensionOperand<5, [EnvVulkan]>;
|
||||
defm SPV_AMD_gpu_shader_half_float : ExtensionOperand<6, [EnvVulkan]>;
|
||||
defm SPV_KHR_shader_draw_parameters : ExtensionOperand<7, [EnvVulkan]>;
|
||||
defm SPV_KHR_subgroup_vote : ExtensionOperand<8, [EnvVulkan]>;
|
||||
defm SPV_KHR_16bit_storage : ExtensionOperand<9, [EnvVulkan]>;
|
||||
defm SPV_KHR_device_group : ExtensionOperand<10, [EnvVulkan]>;
|
||||
defm SPV_KHR_multiview : ExtensionOperand<11, [EnvVulkan]>;
|
||||
defm SPV_NVX_multiview_per_view_attributes : ExtensionOperand<12, [EnvVulkan]>;
|
||||
defm SPV_NV_viewport_array2 : ExtensionOperand<13, [EnvVulkan]>;
|
||||
defm SPV_NV_stereo_view_rendering : ExtensionOperand<14, [EnvVulkan]>;
|
||||
defm SPV_NV_sample_mask_override_coverage : ExtensionOperand<15, [EnvVulkan]>;
|
||||
defm SPV_NV_geometry_shader_passthrough : ExtensionOperand<16, [EnvVulkan]>;
|
||||
defm SPV_AMD_texture_gather_bias_lod : ExtensionOperand<17, [EnvVulkan]>;
|
||||
defm SPV_KHR_storage_buffer_storage_class : ExtensionOperand<18, [EnvVulkan]>;
|
||||
defm SPV_KHR_variable_pointers : ExtensionOperand<19, [EnvVulkan]>;
|
||||
defm SPV_AMD_gpu_shader_int16 : ExtensionOperand<20, [EnvVulkan]>;
|
||||
defm SPV_KHR_post_depth_coverage : ExtensionOperand<21, [EnvVulkan]>;
|
||||
defm SPV_KHR_shader_atomic_counter_ops : ExtensionOperand<22, []>;
|
||||
defm SPV_EXT_shader_stencil_export : ExtensionOperand<23, [EnvVulkan]>;
|
||||
defm SPV_EXT_shader_viewport_index_layer : ExtensionOperand<24, [EnvVulkan]>;
|
||||
defm SPV_AMD_shader_image_load_store_lod : ExtensionOperand<25, [EnvVulkan]>;
|
||||
defm SPV_AMD_shader_fragment_mask : ExtensionOperand<26, [EnvVulkan]>;
|
||||
defm SPV_EXT_fragment_fully_covered : ExtensionOperand<27, [EnvVulkan]>;
|
||||
defm SPV_AMD_gpu_shader_half_float_fetch : ExtensionOperand<28, [EnvVulkan]>;
|
||||
defm SPV_GOOGLE_decorate_string : ExtensionOperand<29, [EnvVulkan]>;
|
||||
defm SPV_GOOGLE_hlsl_functionality1 : ExtensionOperand<30, [EnvVulkan]>;
|
||||
defm SPV_NV_shader_subgroup_partitioned : ExtensionOperand<31, [EnvVulkan]>;
|
||||
defm SPV_EXT_descriptor_indexing : ExtensionOperand<32, [EnvVulkan]>;
|
||||
defm SPV_KHR_8bit_storage : ExtensionOperand<33, [EnvVulkan]>;
|
||||
defm SPV_KHR_vulkan_memory_model : ExtensionOperand<34, [EnvVulkan]>;
|
||||
defm SPV_NV_ray_tracing : ExtensionOperand<35, [EnvVulkan]>;
|
||||
defm SPV_NV_compute_shader_derivatives : ExtensionOperand<36, [EnvVulkan]>;
|
||||
defm SPV_NV_fragment_shader_barycentric : ExtensionOperand<37, [EnvVulkan]>;
|
||||
defm SPV_NV_mesh_shader : ExtensionOperand<38, [EnvVulkan]>;
|
||||
defm SPV_NV_shader_image_footprint : ExtensionOperand<39, [EnvVulkan]>;
|
||||
defm SPV_NV_shading_rate : ExtensionOperand<40, [EnvVulkan]>;
|
||||
defm SPV_INTEL_subgroups : ExtensionOperand<41, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_media_block_io : ExtensionOperand<42, [EnvOpenCL]>;
|
||||
defm SPV_EXT_fragment_invocation_density : ExtensionOperand<44, [EnvVulkan]>;
|
||||
defm SPV_KHR_no_integer_wrap_decoration : ExtensionOperand<45, [EnvOpenCL]>;
|
||||
defm SPV_KHR_float_controls : ExtensionOperand<46, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_EXT_physical_storage_buffer : ExtensionOperand<47, [EnvVulkan]>;
|
||||
defm SPV_INTEL_fpga_memory_attributes : ExtensionOperand<48, [EnvOpenCL]>;
|
||||
defm SPV_NV_cooperative_matrix : ExtensionOperand<49, [EnvVulkan]>;
|
||||
defm SPV_INTEL_shader_integer_functions2
|
||||
: ExtensionOperand<50, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_INTEL_fpga_loop_controls : ExtensionOperand<51, [EnvOpenCL]>;
|
||||
defm SPV_EXT_fragment_shader_interlock : ExtensionOperand<52, [EnvVulkan]>;
|
||||
defm SPV_NV_shader_sm_builtins : ExtensionOperand<53, [EnvVulkan]>;
|
||||
defm SPV_KHR_shader_clock : ExtensionOperand<54, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_INTEL_unstructured_loop_controls : ExtensionOperand<55, [EnvOpenCL]>;
|
||||
defm SPV_EXT_demote_to_helper_invocation : ExtensionOperand<56, [EnvVulkan]>;
|
||||
defm SPV_INTEL_fpga_reg : ExtensionOperand<57, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_blocking_pipes : ExtensionOperand<58, [EnvOpenCL]>;
|
||||
defm SPV_GOOGLE_user_type : ExtensionOperand<59, [EnvVulkan]>;
|
||||
defm SPV_KHR_physical_storage_buffer : ExtensionOperand<60, [EnvVulkan]>;
|
||||
defm SPV_INTEL_kernel_attributes : ExtensionOperand<61, [EnvOpenCL]>;
|
||||
defm SPV_KHR_non_semantic_info : ExtensionOperand<62, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_INTEL_io_pipes : ExtensionOperand<63, [EnvOpenCL]>;
|
||||
defm SPV_KHR_ray_tracing : ExtensionOperand<64, [EnvVulkan]>;
|
||||
defm SPV_KHR_ray_query : ExtensionOperand<65, [EnvVulkan]>;
|
||||
defm SPV_INTEL_fpga_memory_accesses : ExtensionOperand<66, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_arbitrary_precision_integers : ExtensionOperand<67, [EnvOpenCL]>;
|
||||
defm SPV_EXT_shader_atomic_float_add
|
||||
: ExtensionOperand<68, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_KHR_terminate_invocation : ExtensionOperand<69, [EnvVulkan]>;
|
||||
defm SPV_KHR_fragment_shading_rate : ExtensionOperand<70, [EnvVulkan]>;
|
||||
defm SPV_EXT_shader_image_int64 : ExtensionOperand<71, [EnvVulkan]>;
|
||||
defm SPV_INTEL_fp_fast_math_mode : ExtensionOperand<72, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_fpga_cluster_attributes : ExtensionOperand<73, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_loop_fuse : ExtensionOperand<74, [EnvOpenCL]>;
|
||||
defm SPV_EXT_shader_atomic_float_min_max
|
||||
: ExtensionOperand<75, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_KHR_workgroup_memory_explicit_layout
|
||||
: ExtensionOperand<76, [EnvVulkan]>;
|
||||
defm SPV_KHR_linkonce_odr : ExtensionOperand<77, [EnvOpenCL]>;
|
||||
defm SPV_KHR_expect_assume : ExtensionOperand<78, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_INTEL_fpga_dsp_control : ExtensionOperand<79, [EnvOpenCL]>;
|
||||
defm SPV_NV_bindless_texture : ExtensionOperand<80, [EnvVulkan]>;
|
||||
defm SPV_INTEL_fpga_invocation_pipelining_attributes
|
||||
: ExtensionOperand<81, [EnvOpenCL]>;
|
||||
defm SPV_KHR_subgroup_uniform_control_flow : ExtensionOperand<82, [EnvVulkan]>;
|
||||
defm SPV_HUAWEI_subpass_shading : ExtensionOperand<83, [EnvVulkan]>;
|
||||
defm SPV_KHR_integer_dot_product : ExtensionOperand<84, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_EXT_shader_atomic_float16_add
|
||||
: ExtensionOperand<85, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_INTEL_runtime_aligned : ExtensionOperand<86, [EnvOpenCL]>;
|
||||
defm SPV_KHR_bit_instructions : ExtensionOperand<87, [EnvOpenCL]>;
|
||||
defm SPV_NV_ray_tracing_motion_blur : ExtensionOperand<88, [EnvVulkan]>;
|
||||
defm SPV_KHR_uniform_group_instructions : ExtensionOperand<89, [EnvOpenCL]>;
|
||||
defm SPV_KHR_subgroup_rotate : ExtensionOperand<90, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_INTEL_split_barrier : ExtensionOperand<91, [EnvOpenCL]>;
|
||||
defm SPV_KHR_ray_cull_mask : ExtensionOperand<92, [EnvVulkan]>;
|
||||
defm SPV_KHR_fragment_shader_barycentric : ExtensionOperand<93, [EnvVulkan]>;
|
||||
defm SPV_EXT_relaxed_printf_string_address_space
|
||||
: ExtensionOperand<94, [EnvOpenCL]>;
|
||||
defm SPV_EXT_mesh_shader : ExtensionOperand<96, [EnvVulkan]>;
|
||||
defm SPV_ARM_core_builtins : ExtensionOperand<97, [EnvVulkan]>;
|
||||
defm SPV_EXT_opacity_micromap : ExtensionOperand<98, [EnvVulkan]>;
|
||||
defm SPV_NV_shader_invocation_reorder : ExtensionOperand<99, [EnvVulkan]>;
|
||||
defm SPV_INTEL_usm_storage_classes : ExtensionOperand<100, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_optnone : ExtensionOperand<103, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_function_pointers : ExtensionOperand<104, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_variable_length_array : ExtensionOperand<105, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_inline_assembly : ExtensionOperand<107, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_cache_controls : ExtensionOperand<108, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_global_variable_fpga_decorations
|
||||
: ExtensionOperand<110, [EnvOpenCL]>;
|
||||
defm SPV_KHR_cooperative_matrix : ExtensionOperand<111, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_EXT_arithmetic_fence : ExtensionOperand<112, [EnvOpenCL]>;
|
||||
defm SPV_EXT_optnone : ExtensionOperand<113, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_joint_matrix : ExtensionOperand<114, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_float_controls2 : ExtensionOperand<115, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_bindless_images : ExtensionOperand<116, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_long_composites : ExtensionOperand<117, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_memory_access_aliasing : ExtensionOperand<118, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_fp_max_error : ExtensionOperand<119, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_ternary_bitwise_function : ExtensionOperand<120, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_subgroup_matrix_multiply_accumulate
|
||||
: ExtensionOperand<121, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_2d_block_io : ExtensionOperand<122, [EnvOpenCL]>;
|
||||
defm SPV_INTEL_int4 : ExtensionOperand<123, [EnvOpenCL]>;
|
||||
defm SPV_KHR_float_controls2 : ExtensionOperand<124, [EnvVulkan, EnvOpenCL]>;
|
||||
defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Multiclass used to define Capabilities enum values and at the same time
|
||||
@ -342,7 +403,9 @@ class Capability<string name, bits<32> value> {
|
||||
|
||||
multiclass CapabilityOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def NAME : Capability<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<CapabilityOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<CapabilityOperand, value, NAME,
|
||||
minVersion, maxVersion, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Matrix : CapabilityOperand<0, 0, 0, [], []>;
|
||||
@ -551,7 +614,8 @@ class SourceLanguage<string name, bits<32> value> {
|
||||
|
||||
multiclass SourceLanguageOperand<bits<32> value> {
|
||||
def : SourceLanguage<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<SourceLanguageOperand, value, NAME, 0, 0, [], []>;
|
||||
defm : SymbolicOperandWithRequirements<SourceLanguageOperand, value, NAME, 0,
|
||||
0, [], [], []>;
|
||||
}
|
||||
|
||||
defm Unknown : SourceLanguageOperand<0>;
|
||||
@ -580,7 +644,8 @@ class AddressingModel<string name, bits<32> value> {
|
||||
|
||||
multiclass AddressingModelOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : AddressingModel<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<AddressingModelOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<AddressingModelOperand, value, NAME, 0,
|
||||
0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Logical : AddressingModelOperand<0, []>;
|
||||
@ -607,7 +672,8 @@ class ExecutionModel<string name, bits<32> value> {
|
||||
|
||||
multiclass ExecutionModelOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : ExecutionModel<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<ExecutionModelOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<ExecutionModelOperand, value, NAME, 0,
|
||||
0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Vertex : ExecutionModelOperand<0, [Shader]>;
|
||||
@ -645,7 +711,8 @@ class MemoryModel<string name, bits<32> value> {
|
||||
|
||||
multiclass MemoryModelOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : MemoryModel<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<MemoryModelOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<MemoryModelOperand, value, NAME, 0,
|
||||
0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Simple : MemoryModelOperand<0, [Shader]>;
|
||||
@ -672,7 +739,8 @@ class ExecutionMode<string name, bits<32> value> {
|
||||
|
||||
multiclass ExecutionModeOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : ExecutionMode<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<ExecutionModeOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<ExecutionModeOperand, value, NAME, 0,
|
||||
0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Invocations : ExecutionModeOperand<0, [Geometry]>;
|
||||
@ -748,7 +816,8 @@ class StorageClass<string name, bits<32> value> {
|
||||
|
||||
multiclass StorageClassOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : StorageClass<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<StorageClassOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<StorageClassOperand, value, NAME, 0, 0,
|
||||
reqExtensions, reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm UniformConstant : StorageClassOperand<0, [], []>;
|
||||
@ -794,7 +863,8 @@ class Dim<string name, bits<32> value> {
|
||||
|
||||
multiclass DimOperand<bits<32> value, string mnemonic, list<Capability> reqCapabilities> {
|
||||
def NAME : Dim<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<DimOperand, value, mnemonic, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<DimOperand, value, mnemonic, 0, 0, [],
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm DIM_1D : DimOperand<0, "1D", [Sampled1D, Image1D]>;
|
||||
@ -824,7 +894,8 @@ class SamplerAddressingMode<string name, bits<32> value> {
|
||||
|
||||
multiclass SamplerAddressingModeOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : SamplerAddressingMode<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<SamplerAddressingModeOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<SamplerAddressingModeOperand, value,
|
||||
NAME, 0, 0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm None : SamplerAddressingModeOperand<0, [Kernel]>;
|
||||
@ -852,7 +923,8 @@ class SamplerFilterMode<string name, bits<32> value> {
|
||||
|
||||
multiclass SamplerFilterModeOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : SamplerFilterMode<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<SamplerFilterModeOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<SamplerFilterModeOperand, value, NAME,
|
||||
0, 0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Nearest : SamplerFilterModeOperand<0, [Kernel]>;
|
||||
@ -877,7 +949,8 @@ class ImageFormat<string name, bits<32> value> {
|
||||
|
||||
multiclass ImageFormatOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def NAME : ImageFormat<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<ImageFormatOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<ImageFormatOperand, value, NAME, 0,
|
||||
0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Unknown : ImageFormatOperand<0, []>;
|
||||
@ -940,7 +1013,8 @@ class ImageChannelOrder<string name, bits<32> value> {
|
||||
|
||||
multiclass ImageChannelOrderOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : ImageChannelOrder<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<ImageChannelOrderOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<ImageChannelOrderOperand, value, NAME,
|
||||
0, 0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm R : ImageChannelOrderOperand<0, [Kernel]>;
|
||||
@ -983,7 +1057,8 @@ class ImageChannelDataType<string name, bits<32> value> {
|
||||
|
||||
multiclass ImageChannelDataTypeOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : ImageChannelDataType<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<ImageChannelDataTypeOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<ImageChannelDataTypeOperand, value,
|
||||
NAME, 0, 0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm SnormInt8 : ImageChannelDataTypeOperand<0, []>;
|
||||
@ -1023,7 +1098,8 @@ class ImageOperand<string name, bits<32> value> {
|
||||
|
||||
multiclass ImageOperandOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : ImageOperand<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<ImageOperandOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<ImageOperandOperand, value, NAME, 0,
|
||||
0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm None : ImageOperandOperand<0x0, []>;
|
||||
@ -1061,7 +1137,8 @@ class FPFastMathMode<string name, bits<32> value> {
|
||||
|
||||
multiclass FPFastMathModeOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : FPFastMathMode<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<FPFastMathModeOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<FPFastMathModeOperand, value, NAME, 0,
|
||||
0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm None : FPFastMathModeOperand<0x0, []>;
|
||||
@ -1090,7 +1167,8 @@ class FPRoundingMode<string name, bits<32> value> {
|
||||
|
||||
multiclass FPRoundingModeOperand<bits<32> value> {
|
||||
def NAME : FPRoundingMode<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<FPRoundingModeOperand, value, NAME, 0, 0, [], []>;
|
||||
defm : SymbolicOperandWithRequirements<FPRoundingModeOperand, value, NAME, 0,
|
||||
0, [], [], []>;
|
||||
}
|
||||
|
||||
defm RTE : FPRoundingModeOperand<0>;
|
||||
@ -1117,7 +1195,8 @@ class LinkageType<string name, bits<32> value> {
|
||||
|
||||
multiclass LinkageTypeOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : LinkageType<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<LinkageTypeOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<LinkageTypeOperand, value, NAME, 0,
|
||||
0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Export : LinkageTypeOperand<0, [Linkage]>;
|
||||
@ -1143,7 +1222,8 @@ class AccessQualifier<string name, bits<32> value> {
|
||||
|
||||
multiclass AccessQualifierOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def NAME : AccessQualifier<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<AccessQualifierOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<AccessQualifierOperand, value, NAME, 0,
|
||||
0, [], reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm ReadOnly : AccessQualifierOperand<0, [Kernel]>;
|
||||
@ -1170,7 +1250,9 @@ class FunctionParameterAttribute<string name, bits<32> value> {
|
||||
|
||||
multiclass FunctionParameterAttributeOperand<bits<32> value, list<Capability> reqCapabilities> {
|
||||
def : FunctionParameterAttribute<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<FunctionParameterAttributeOperand, value, NAME, 0, 0, [], reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<FunctionParameterAttributeOperand,
|
||||
value, NAME, 0, 0, [],
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Zext : FunctionParameterAttributeOperand<0, [Kernel]>;
|
||||
@ -1202,7 +1284,9 @@ class Decoration<string name, bits<32> value> {
|
||||
|
||||
multiclass DecorationOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : Decoration<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<DecorationOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<DecorationOperand, value, NAME,
|
||||
minVersion, maxVersion, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm RelaxedPrecision : DecorationOperand<0, 0, 0, [], [Shader]>;
|
||||
@ -1303,7 +1387,9 @@ class BuiltIn<string name, bits<32> value> {
|
||||
|
||||
multiclass BuiltInOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def NAME : BuiltIn<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<BuiltInOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<BuiltInOperand, value, NAME,
|
||||
minVersion, maxVersion, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Position : BuiltInOperand<0, 0, 0, [], [Shader]>;
|
||||
@ -1417,7 +1503,8 @@ class SelectionControl<string name, bits<32> value> {
|
||||
|
||||
multiclass SelectionControlOperand<bits<32> value> {
|
||||
def : SelectionControl<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<SelectionControlOperand, value, NAME, 0, 0, [], []>;
|
||||
defm : SymbolicOperandWithRequirements<SelectionControlOperand, value, NAME,
|
||||
0, 0, [], [], []>;
|
||||
}
|
||||
|
||||
defm None : SelectionControlOperand<0x0>;
|
||||
@ -1443,7 +1530,8 @@ class LoopControl<string name, bits<32> value> {
|
||||
|
||||
multiclass LoopControlOperand<bits<32> value> {
|
||||
def : LoopControl<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<LoopControlOperand, value, NAME, 0, 0, [], []>;
|
||||
defm : SymbolicOperandWithRequirements<LoopControlOperand, value, NAME, 0,
|
||||
0, [], [], []>;
|
||||
}
|
||||
|
||||
defm None : LoopControlOperand<0x0>;
|
||||
@ -1476,7 +1564,8 @@ class FunctionControl<string name, bits<32> value> {
|
||||
|
||||
multiclass FunctionControlOperand<bits<32> value> {
|
||||
def : FunctionControl<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<FunctionControlOperand, value, NAME, 0, 0, [], []>;
|
||||
defm : SymbolicOperandWithRequirements<FunctionControlOperand, value, NAME, 0,
|
||||
0, [], [], []>;
|
||||
}
|
||||
|
||||
defm None : FunctionControlOperand<0x0>;
|
||||
@ -1506,7 +1595,9 @@ class MemorySemantics<string name, bits<32> value> {
|
||||
|
||||
multiclass MemorySemanticsOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : MemorySemantics<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<MemorySemanticsOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<MemorySemanticsOperand, value, NAME,
|
||||
minVersion, maxVersion, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm None : MemorySemanticsOperand<0x0, 0, 0, [], []>;
|
||||
@ -1544,7 +1635,9 @@ class MemoryOperand<string name, bits<32> value> {
|
||||
|
||||
multiclass MemoryOperandOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : MemoryOperand<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<MemoryOperandOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<MemoryOperandOperand, value, NAME,
|
||||
minVersion, maxVersion, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm None : MemoryOperandOperand<0x0, 0, 0, [], []>;
|
||||
@ -1577,7 +1670,9 @@ class Scope<string name, bits<32> value> {
|
||||
|
||||
multiclass ScopeOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : Scope<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<ScopeOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<ScopeOperand, value, NAME, minVersion,
|
||||
maxVersion, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm CrossDevice : ScopeOperand<0, 0, 0, [], []>;
|
||||
@ -1607,7 +1702,9 @@ class GroupOperation<string name, bits<32> value> {
|
||||
|
||||
multiclass GroupOperationOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def NAME : GroupOperation<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<GroupOperationOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<GroupOperationOperand, value, NAME,
|
||||
minVersion, maxVersion, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm Reduce : GroupOperationOperand<0, 0, 0, [], [Kernel, GroupNonUniformArithmetic, GroupNonUniformBallot]>;
|
||||
@ -1638,7 +1735,9 @@ class KernelEnqueueFlags<string name, bits<32> value> {
|
||||
|
||||
multiclass KernelEnqueueFlagsOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : KernelEnqueueFlags<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<KernelEnqueueFlagsOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<KernelEnqueueFlagsOperand, value, NAME,
|
||||
minVersion, maxVersion, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm NoWait : KernelEnqueueFlagsOperand<0, 0, 0, [], [Kernel]>;
|
||||
@ -1665,7 +1764,9 @@ class KernelProfilingInfo<string name, bits<32> value> {
|
||||
|
||||
multiclass KernelProfilingInfoOperand<bits<32> value, bits<32> minVersion, bits<32> maxVersion, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : KernelProfilingInfo<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<KernelProfilingInfoOperand, value, NAME, minVersion, maxVersion, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<KernelProfilingInfoOperand, value,
|
||||
NAME, minVersion, maxVersion,
|
||||
reqExtensions, reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm None : KernelProfilingInfoOperand<0x0, 0, 0, [], []>;
|
||||
@ -1690,7 +1791,8 @@ class Opcode<string name, bits<32> value> {
|
||||
|
||||
multiclass OpcodeOperand<bits<32> value> {
|
||||
def : Opcode<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<OpcodeOperand, value, NAME, 0, 0, [], []>;
|
||||
defm : SymbolicOperandWithRequirements<OpcodeOperand, value, NAME, 0,
|
||||
0, [], [], []>;
|
||||
}
|
||||
// TODO: implement other mnemonics.
|
||||
defm InBoundsAccessChain : OpcodeOperand<66>;
|
||||
@ -1720,7 +1822,9 @@ class CooperativeMatrixLayout<string name, bits<32> value> {
|
||||
|
||||
multiclass CooperativeMatrixLayoutOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : CooperativeMatrixLayout<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<CooperativeMatrixLayoutOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<CooperativeMatrixLayoutOperand, value,
|
||||
NAME, 0, 0, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm RowMajorKHR : CooperativeMatrixLayoutOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>;
|
||||
@ -1747,7 +1851,9 @@ class CooperativeMatrixOperands<string name, bits<32> value> {
|
||||
|
||||
multiclass CooperativeMatrixOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : CooperativeMatrixOperands<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<CooperativeMatrixOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<CooperativeMatrixOperandsOperand,
|
||||
value, NAME, 0, 0, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
defm NoneKHR : CooperativeMatrixOperandsOperand<0x0, [SPV_KHR_cooperative_matrix], [CooperativeMatrixKHR]>;
|
||||
@ -1780,7 +1886,9 @@ class SpecConstantOpOperands<string name, bits<32> value> {
|
||||
|
||||
multiclass SpecConstantOpOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
|
||||
def : SpecConstantOpOperands<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
|
||||
defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value,
|
||||
NAME, 0, 0, reqExtensions,
|
||||
reqCapabilities, []>;
|
||||
}
|
||||
|
||||
// Conversion
|
||||
@ -1868,7 +1976,9 @@ class MatrixMultiplyAccumulateOperands<string name, bits<32> value> {
|
||||
|
||||
multiclass MatrixMultiplyAccumulateOperandsOperand<bits<32> value, list<Extension> reqExtensions> {
|
||||
def : MatrixMultiplyAccumulateOperands<NAME, value>;
|
||||
defm : SymbolicOperandWithRequirements<MatrixMultiplyAccumulateOperandsOperand, value, NAME, 0, 0, reqExtensions, []>;
|
||||
defm : SymbolicOperandWithRequirements<
|
||||
MatrixMultiplyAccumulateOperandsOperand, value, NAME, 0, 0,
|
||||
reqExtensions, [], []>;
|
||||
}
|
||||
|
||||
defm None : MatrixMultiplyAccumulateOperandsOperand<0x0, [SPV_INTEL_subgroup_matrix_multiply_accumulate]>;
|
||||
|
||||
@ -28,6 +28,8 @@ namespace llvm {
|
||||
|
||||
bool useSoftFloat() const override;
|
||||
|
||||
bool softPromoteHalfType() const override { return true; }
|
||||
|
||||
/// computeKnownBitsForTargetNode - Determine which of the bits specified
|
||||
/// in Mask are known to be either zero or one and return them in the
|
||||
/// KnownZero/KnownOne bitsets.
|
||||
|
||||
@ -5486,14 +5486,32 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
|
||||
// Multiply and Add Packed Words
|
||||
// < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
|
||||
// < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
|
||||
// <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
|
||||
//
|
||||
// Multiply and Add Packed Signed and Unsigned Bytes
|
||||
// < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
|
||||
// <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
|
||||
// <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
|
||||
//
|
||||
// These intrinsics are auto-upgraded into non-masked forms:
|
||||
// < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128
|
||||
// (<8 x i16>, <8 x i16>, <4 x i32>, i8)
|
||||
// < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256
|
||||
// (<16 x i16>, <16 x i16>, <8 x i32>, i8)
|
||||
// <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512
|
||||
// (<32 x i16>, <32 x i16>, <16 x i32>, i16)
|
||||
// < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128
|
||||
// (<16 x i8>, <16 x i8>, <8 x i16>, i8)
|
||||
// <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256
|
||||
// (<32 x i8>, <32 x i8>, <16 x i16>, i16)
|
||||
// <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512
|
||||
// (<64 x i8>, <64 x i8>, <32 x i16>, i32)
|
||||
case Intrinsic::x86_sse2_pmadd_wd:
|
||||
case Intrinsic::x86_avx2_pmadd_wd:
|
||||
case Intrinsic::x86_avx512_pmaddw_d_512:
|
||||
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
|
||||
case Intrinsic::x86_avx2_pmadd_ub_sw:
|
||||
case Intrinsic::x86_avx512_pmaddubs_w_512:
|
||||
handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
|
||||
break;
|
||||
|
||||
|
||||
@ -565,6 +565,9 @@ struct ConstantComparesGatherer {
|
||||
/// Number of comparisons matched in the and/or chain
|
||||
unsigned UsedICmps = 0;
|
||||
|
||||
/// If the elements in Vals match the comparisons
|
||||
bool IsEq = false;
|
||||
|
||||
/// Construct and compute the result for the comparison instruction Cond
|
||||
ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL) : DL(DL) {
|
||||
gather(Cond);
|
||||
@ -736,23 +739,23 @@ private:
|
||||
/// vector.
|
||||
/// One "Extra" case is allowed to differ from the other.
|
||||
void gather(Value *V) {
|
||||
bool isEQ = match(V, m_LogicalOr(m_Value(), m_Value()));
|
||||
|
||||
Value *Op0, *Op1;
|
||||
if (match(V, m_LogicalOr(m_Value(Op0), m_Value(Op1))))
|
||||
IsEq = true;
|
||||
else if (match(V, m_LogicalAnd(m_Value(Op0), m_Value(Op1))))
|
||||
IsEq = false;
|
||||
else
|
||||
return;
|
||||
// Keep a stack (SmallVector for efficiency) for depth-first traversal
|
||||
SmallVector<Value *, 8> DFT;
|
||||
SmallPtrSet<Value *, 8> Visited;
|
||||
|
||||
// Initialize
|
||||
Visited.insert(V);
|
||||
DFT.push_back(V);
|
||||
SmallVector<Value *, 8> DFT{Op0, Op1};
|
||||
SmallPtrSet<Value *, 8> Visited{V, Op0, Op1};
|
||||
|
||||
while (!DFT.empty()) {
|
||||
V = DFT.pop_back_val();
|
||||
|
||||
if (Instruction *I = dyn_cast<Instruction>(V)) {
|
||||
// If it is a || (or && depending on IsEq), process the operands.
|
||||
Value *Op0, *Op1;
|
||||
if (isEQ ? match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1)))
|
||||
if (IsEq ? match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1)))
|
||||
: match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) {
|
||||
if (Visited.insert(Op1).second)
|
||||
DFT.push_back(Op1);
|
||||
@ -763,7 +766,7 @@ private:
|
||||
}
|
||||
|
||||
// Try to match the current instruction
|
||||
if (matchInstruction(I, isEQ))
|
||||
if (matchInstruction(I, IsEq))
|
||||
// Match succeeded, continue the loop
|
||||
continue;
|
||||
}
|
||||
@ -5103,6 +5106,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
|
||||
Value *CompVal = ConstantCompare.CompValue;
|
||||
unsigned UsedICmps = ConstantCompare.UsedICmps;
|
||||
Value *ExtraCase = ConstantCompare.Extra;
|
||||
bool TrueWhenEqual = ConstantCompare.IsEq;
|
||||
|
||||
// If we didn't have a multiply compared value, fail.
|
||||
if (!CompVal)
|
||||
@ -5112,8 +5116,6 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
|
||||
if (UsedICmps <= 1)
|
||||
return false;
|
||||
|
||||
bool TrueWhenEqual = match(Cond, m_LogicalOr(m_Value(), m_Value()));
|
||||
|
||||
// There might be duplicate constants in the list, which the switch
|
||||
// instruction can't handle, remove them now.
|
||||
array_pod_sort(Values.begin(), Values.end(), constantIntSortPredicate);
|
||||
|
||||
@ -793,267 +793,30 @@ static bool canWidenCallReturnType(Type *Ty) {
|
||||
}
|
||||
|
||||
bool LoopVectorizationLegality::canVectorizeInstrs() {
|
||||
BasicBlock *Header = TheLoop->getHeader();
|
||||
bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
|
||||
bool Result = true;
|
||||
|
||||
// For each block in the loop.
|
||||
for (BasicBlock *BB : TheLoop->blocks()) {
|
||||
// Scan the instructions in the block and look for hazards.
|
||||
for (Instruction &I : *BB) {
|
||||
if (auto *Phi = dyn_cast<PHINode>(&I)) {
|
||||
Type *PhiTy = Phi->getType();
|
||||
// Check that this PHI type is allowed.
|
||||
if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
|
||||
!PhiTy->isPointerTy()) {
|
||||
reportVectorizationFailure("Found a non-int non-pointer PHI",
|
||||
"loop control flow is not understood by vectorizer",
|
||||
"CFGNotUnderstood", ORE, TheLoop);
|
||||
return false;
|
||||
}
|
||||
|
||||
// If this PHINode is not in the header block, then we know that we
|
||||
// can convert it to select during if-conversion. No need to check if
|
||||
// the PHIs in this block are induction or reduction variables.
|
||||
if (BB != Header) {
|
||||
// Non-header phi nodes that have outside uses can be vectorized. Add
|
||||
// them to the list of allowed exits.
|
||||
// Unsafe cyclic dependencies with header phis are identified during
|
||||
// legalization for reduction, induction and fixed order
|
||||
// recurrences.
|
||||
AllowedExit.insert(&I);
|
||||
continue;
|
||||
}
|
||||
|
||||
// We only allow if-converted PHIs with exactly two incoming values.
|
||||
if (Phi->getNumIncomingValues() != 2) {
|
||||
reportVectorizationFailure("Found an invalid PHI",
|
||||
"loop control flow is not understood by vectorizer",
|
||||
"CFGNotUnderstood", ORE, TheLoop, Phi);
|
||||
return false;
|
||||
}
|
||||
|
||||
RecurrenceDescriptor RedDes;
|
||||
if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
|
||||
DT, PSE.getSE())) {
|
||||
Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
|
||||
AllowedExit.insert(RedDes.getLoopExitInstr());
|
||||
Reductions[Phi] = RedDes;
|
||||
continue;
|
||||
}
|
||||
|
||||
// We prevent matching non-constant strided pointer IVS to preserve
|
||||
// historical vectorizer behavior after a generalization of the
|
||||
// IVDescriptor code. The intent is to remove this check, but we
|
||||
// have to fix issues around code quality for such loops first.
|
||||
auto IsDisallowedStridedPointerInduction =
|
||||
[](const InductionDescriptor &ID) {
|
||||
if (AllowStridedPointerIVs)
|
||||
return false;
|
||||
return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
|
||||
ID.getConstIntStepValue() == nullptr;
|
||||
};
|
||||
|
||||
// TODO: Instead of recording the AllowedExit, it would be good to
|
||||
// record the complementary set: NotAllowedExit. These include (but may
|
||||
// not be limited to):
|
||||
// 1. Reduction phis as they represent the one-before-last value, which
|
||||
// is not available when vectorized
|
||||
// 2. Induction phis and increment when SCEV predicates cannot be used
|
||||
// outside the loop - see addInductionPhi
|
||||
// 3. Non-Phis with outside uses when SCEV predicates cannot be used
|
||||
// outside the loop - see call to hasOutsideLoopUser in the non-phi
|
||||
// handling below
|
||||
// 4. FixedOrderRecurrence phis that can possibly be handled by
|
||||
// extraction.
|
||||
// By recording these, we can then reason about ways to vectorize each
|
||||
// of these NotAllowedExit.
|
||||
InductionDescriptor ID;
|
||||
if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
|
||||
!IsDisallowedStridedPointerInduction(ID)) {
|
||||
addInductionPhi(Phi, ID, AllowedExit);
|
||||
Requirements->addExactFPMathInst(ID.getExactFPMathInst());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
|
||||
AllowedExit.insert(Phi);
|
||||
FixedOrderRecurrences.insert(Phi);
|
||||
continue;
|
||||
}
|
||||
|
||||
// As a last resort, coerce the PHI to an AddRec expression
|
||||
// and re-try classifying it as an induction PHI.
|
||||
if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
|
||||
!IsDisallowedStridedPointerInduction(ID)) {
|
||||
addInductionPhi(Phi, ID, AllowedExit);
|
||||
continue;
|
||||
}
|
||||
|
||||
reportVectorizationFailure("Found an unidentified PHI",
|
||||
"value that could not be identified as "
|
||||
"reduction is used outside the loop",
|
||||
"NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
|
||||
Result &= canVectorizeInstr(I);
|
||||
if (!DoExtraAnalysis && !Result)
|
||||
return false;
|
||||
} // end of PHI handling
|
||||
|
||||
// We handle calls that:
|
||||
// * Have a mapping to an IR intrinsic.
|
||||
// * Have a vector version available.
|
||||
auto *CI = dyn_cast<CallInst>(&I);
|
||||
|
||||
if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
|
||||
!(CI->getCalledFunction() && TLI &&
|
||||
(!VFDatabase::getMappings(*CI).empty() ||
|
||||
isTLIScalarize(*TLI, *CI)))) {
|
||||
// If the call is a recognized math library call, it is likely that
|
||||
// we can vectorize it given loosened floating-point constraints.
|
||||
LibFunc Func;
|
||||
bool IsMathLibCall =
|
||||
TLI && CI->getCalledFunction() &&
|
||||
CI->getType()->isFloatingPointTy() &&
|
||||
TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
|
||||
TLI->hasOptimizedCodeGen(Func);
|
||||
|
||||
if (IsMathLibCall) {
|
||||
// TODO: Ideally, we should not use clang-specific language here,
|
||||
// but it's hard to provide meaningful yet generic advice.
|
||||
// Also, should this be guarded by allowExtraAnalysis() and/or be part
|
||||
// of the returned info from isFunctionVectorizable()?
|
||||
reportVectorizationFailure(
|
||||
"Found a non-intrinsic callsite",
|
||||
"library call cannot be vectorized. "
|
||||
"Try compiling with -fno-math-errno, -ffast-math, "
|
||||
"or similar flags",
|
||||
"CantVectorizeLibcall", ORE, TheLoop, CI);
|
||||
} else {
|
||||
reportVectorizationFailure("Found a non-intrinsic callsite",
|
||||
"call instruction cannot be vectorized",
|
||||
"CantVectorizeLibcall", ORE, TheLoop, CI);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Some intrinsics have scalar arguments and should be same in order for
|
||||
// them to be vectorized (i.e. loop invariant).
|
||||
if (CI) {
|
||||
auto *SE = PSE.getSE();
|
||||
Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
|
||||
for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
|
||||
if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) {
|
||||
if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)),
|
||||
TheLoop)) {
|
||||
reportVectorizationFailure("Found unvectorizable intrinsic",
|
||||
"intrinsic instruction cannot be vectorized",
|
||||
"CantVectorizeIntrinsic", ORE, TheLoop, CI);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we found a vectorized variant of a function, note that so LV can
|
||||
// make better decisions about maximum VF.
|
||||
if (CI && !VFDatabase::getMappings(*CI).empty())
|
||||
VecCallVariantsFound = true;
|
||||
|
||||
auto CanWidenInstructionTy = [](Instruction const &Inst) {
|
||||
Type *InstTy = Inst.getType();
|
||||
if (!isa<StructType>(InstTy))
|
||||
return canVectorizeTy(InstTy);
|
||||
|
||||
// For now, we only recognize struct values returned from calls where
|
||||
// all users are extractvalue as vectorizable. All element types of the
|
||||
// struct must be types that can be widened.
|
||||
return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
|
||||
all_of(Inst.users(), IsaPred<ExtractValueInst>);
|
||||
};
|
||||
|
||||
// Check that the instruction return type is vectorizable.
|
||||
// We can't vectorize casts from vector type to scalar type.
|
||||
// Also, we can't vectorize extractelement instructions.
|
||||
if (!CanWidenInstructionTy(I) ||
|
||||
(isa<CastInst>(I) &&
|
||||
!VectorType::isValidElementType(I.getOperand(0)->getType())) ||
|
||||
isa<ExtractElementInst>(I)) {
|
||||
reportVectorizationFailure("Found unvectorizable type",
|
||||
"instruction return type cannot be vectorized",
|
||||
"CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that the stored type is vectorizable.
|
||||
if (auto *ST = dyn_cast<StoreInst>(&I)) {
|
||||
Type *T = ST->getValueOperand()->getType();
|
||||
if (!VectorType::isValidElementType(T)) {
|
||||
reportVectorizationFailure("Store instruction cannot be vectorized",
|
||||
"CantVectorizeStore", ORE, TheLoop, ST);
|
||||
return false;
|
||||
}
|
||||
|
||||
// For nontemporal stores, check that a nontemporal vector version is
|
||||
// supported on the target.
|
||||
if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
|
||||
// Arbitrarily try a vector of 2 elements.
|
||||
auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
|
||||
assert(VecTy && "did not find vectorized version of stored type");
|
||||
if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
|
||||
reportVectorizationFailure(
|
||||
"nontemporal store instruction cannot be vectorized",
|
||||
"CantVectorizeNontemporalStore", ORE, TheLoop, ST);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (auto *LD = dyn_cast<LoadInst>(&I)) {
|
||||
if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
|
||||
// For nontemporal loads, check that a nontemporal vector version is
|
||||
// supported on the target (arbitrarily try a vector of 2 elements).
|
||||
auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
|
||||
assert(VecTy && "did not find vectorized version of load type");
|
||||
if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
|
||||
reportVectorizationFailure(
|
||||
"nontemporal load instruction cannot be vectorized",
|
||||
"CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// FP instructions can allow unsafe algebra, thus vectorizable by
|
||||
// non-IEEE-754 compliant SIMD units.
|
||||
// This applies to floating-point math operations and calls, not memory
|
||||
// operations, shuffles, or casts, as they don't change precision or
|
||||
// semantics.
|
||||
} else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
|
||||
!I.isFast()) {
|
||||
LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
|
||||
Hints->setPotentiallyUnsafe();
|
||||
}
|
||||
|
||||
// Reduction instructions are allowed to have exit users.
|
||||
// All other instructions must not have external users.
|
||||
if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
|
||||
// We can safely vectorize loops where instructions within the loop are
|
||||
// used outside the loop only if the SCEV predicates within the loop are
|
||||
// same as outside the loop. Allowing the exit means reusing the SCEV
|
||||
// outside the loop.
|
||||
if (PSE.getPredicate().isAlwaysTrue()) {
|
||||
AllowedExit.insert(&I);
|
||||
continue;
|
||||
}
|
||||
reportVectorizationFailure("Value cannot be used outside the loop",
|
||||
"ValueUsedOutsideLoop", ORE, TheLoop, &I);
|
||||
return false;
|
||||
}
|
||||
} // next instr.
|
||||
}
|
||||
}
|
||||
|
||||
if (!PrimaryInduction) {
|
||||
if (Inductions.empty()) {
|
||||
reportVectorizationFailure("Did not find one integer induction var",
|
||||
reportVectorizationFailure(
|
||||
"Did not find one integer induction var",
|
||||
"loop induction variable could not be identified",
|
||||
"NoInductionVariable", ORE, TheLoop);
|
||||
return false;
|
||||
}
|
||||
if (!WidestIndTy) {
|
||||
reportVectorizationFailure("Did not find one integer induction var",
|
||||
reportVectorizationFailure(
|
||||
"Did not find one integer induction var",
|
||||
"integer loop induction variable could not be identified",
|
||||
"NoIntegerInductionVariable", ORE, TheLoop);
|
||||
return false;
|
||||
@ -1067,6 +830,259 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
|
||||
if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
|
||||
PrimaryInduction = nullptr;
|
||||
|
||||
return Result;
|
||||
}
|
||||
|
||||
bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
|
||||
BasicBlock *BB = I.getParent();
|
||||
BasicBlock *Header = TheLoop->getHeader();
|
||||
|
||||
if (auto *Phi = dyn_cast<PHINode>(&I)) {
|
||||
Type *PhiTy = Phi->getType();
|
||||
// Check that this PHI type is allowed.
|
||||
if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
|
||||
!PhiTy->isPointerTy()) {
|
||||
reportVectorizationFailure(
|
||||
"Found a non-int non-pointer PHI",
|
||||
"loop control flow is not understood by vectorizer",
|
||||
"CFGNotUnderstood", ORE, TheLoop);
|
||||
return false;
|
||||
}
|
||||
|
||||
// If this PHINode is not in the header block, then we know that we
|
||||
// can convert it to select during if-conversion. No need to check if
|
||||
// the PHIs in this block are induction or reduction variables.
|
||||
if (BB != Header) {
|
||||
// Non-header phi nodes that have outside uses can be vectorized. Add
|
||||
// them to the list of allowed exits.
|
||||
// Unsafe cyclic dependencies with header phis are identified during
|
||||
// legalization for reduction, induction and fixed order
|
||||
// recurrences.
|
||||
AllowedExit.insert(&I);
|
||||
return true;
|
||||
}
|
||||
|
||||
// We only allow if-converted PHIs with exactly two incoming values.
|
||||
if (Phi->getNumIncomingValues() != 2) {
|
||||
reportVectorizationFailure(
|
||||
"Found an invalid PHI",
|
||||
"loop control flow is not understood by vectorizer",
|
||||
"CFGNotUnderstood", ORE, TheLoop, Phi);
|
||||
return false;
|
||||
}
|
||||
|
||||
RecurrenceDescriptor RedDes;
|
||||
if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT,
|
||||
PSE.getSE())) {
|
||||
Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
|
||||
AllowedExit.insert(RedDes.getLoopExitInstr());
|
||||
Reductions[Phi] = RedDes;
|
||||
return true;
|
||||
}
|
||||
|
||||
// We prevent matching non-constant strided pointer IVS to preserve
|
||||
// historical vectorizer behavior after a generalization of the
|
||||
// IVDescriptor code. The intent is to remove this check, but we
|
||||
// have to fix issues around code quality for such loops first.
|
||||
auto IsDisallowedStridedPointerInduction =
|
||||
[](const InductionDescriptor &ID) {
|
||||
if (AllowStridedPointerIVs)
|
||||
return false;
|
||||
return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
|
||||
ID.getConstIntStepValue() == nullptr;
|
||||
};
|
||||
|
||||
// TODO: Instead of recording the AllowedExit, it would be good to
|
||||
// record the complementary set: NotAllowedExit. These include (but may
|
||||
// not be limited to):
|
||||
// 1. Reduction phis as they represent the one-before-last value, which
|
||||
// is not available when vectorized
|
||||
// 2. Induction phis and increment when SCEV predicates cannot be used
|
||||
// outside the loop - see addInductionPhi
|
||||
// 3. Non-Phis with outside uses when SCEV predicates cannot be used
|
||||
// outside the loop - see call to hasOutsideLoopUser in the non-phi
|
||||
// handling below
|
||||
// 4. FixedOrderRecurrence phis that can possibly be handled by
|
||||
// extraction.
|
||||
// By recording these, we can then reason about ways to vectorize each
|
||||
// of these NotAllowedExit.
|
||||
InductionDescriptor ID;
|
||||
if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
|
||||
!IsDisallowedStridedPointerInduction(ID)) {
|
||||
addInductionPhi(Phi, ID, AllowedExit);
|
||||
Requirements->addExactFPMathInst(ID.getExactFPMathInst());
|
||||
return true;
|
||||
}
|
||||
|
||||
if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
|
||||
AllowedExit.insert(Phi);
|
||||
FixedOrderRecurrences.insert(Phi);
|
||||
return true;
|
||||
}
|
||||
|
||||
// As a last resort, coerce the PHI to an AddRec expression
|
||||
// and re-try classifying it as an induction PHI.
|
||||
if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
|
||||
!IsDisallowedStridedPointerInduction(ID)) {
|
||||
addInductionPhi(Phi, ID, AllowedExit);
|
||||
return true;
|
||||
}
|
||||
|
||||
reportVectorizationFailure("Found an unidentified PHI",
|
||||
"value that could not be identified as "
|
||||
"reduction is used outside the loop",
|
||||
"NonReductionValueUsedOutsideLoop", ORE, TheLoop,
|
||||
Phi);
|
||||
return false;
|
||||
} // end of PHI handling
|
||||
|
||||
// We handle calls that:
|
||||
// * Have a mapping to an IR intrinsic.
|
||||
// * Have a vector version available.
|
||||
auto *CI = dyn_cast<CallInst>(&I);
|
||||
|
||||
if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
|
||||
!(CI->getCalledFunction() && TLI &&
|
||||
(!VFDatabase::getMappings(*CI).empty() || isTLIScalarize(*TLI, *CI)))) {
|
||||
// If the call is a recognized math library call, it is likely that
|
||||
// we can vectorize it given loosened floating-point constraints.
|
||||
LibFunc Func;
|
||||
bool IsMathLibCall =
|
||||
TLI && CI->getCalledFunction() && CI->getType()->isFloatingPointTy() &&
|
||||
TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
|
||||
TLI->hasOptimizedCodeGen(Func);
|
||||
|
||||
if (IsMathLibCall) {
|
||||
// TODO: Ideally, we should not use clang-specific language here,
|
||||
// but it's hard to provide meaningful yet generic advice.
|
||||
// Also, should this be guarded by allowExtraAnalysis() and/or be part
|
||||
// of the returned info from isFunctionVectorizable()?
|
||||
reportVectorizationFailure(
|
||||
"Found a non-intrinsic callsite",
|
||||
"library call cannot be vectorized. "
|
||||
"Try compiling with -fno-math-errno, -ffast-math, "
|
||||
"or similar flags",
|
||||
"CantVectorizeLibcall", ORE, TheLoop, CI);
|
||||
} else {
|
||||
reportVectorizationFailure("Found a non-intrinsic callsite",
|
||||
"call instruction cannot be vectorized",
|
||||
"CantVectorizeLibcall", ORE, TheLoop, CI);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Some intrinsics have scalar arguments and should be same in order for
|
||||
// them to be vectorized (i.e. loop invariant).
|
||||
if (CI) {
|
||||
auto *SE = PSE.getSE();
|
||||
Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
|
||||
for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
|
||||
if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) {
|
||||
if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), TheLoop)) {
|
||||
reportVectorizationFailure(
|
||||
"Found unvectorizable intrinsic",
|
||||
"intrinsic instruction cannot be vectorized",
|
||||
"CantVectorizeIntrinsic", ORE, TheLoop, CI);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we found a vectorized variant of a function, note that so LV can
|
||||
// make better decisions about maximum VF.
|
||||
if (CI && !VFDatabase::getMappings(*CI).empty())
|
||||
VecCallVariantsFound = true;
|
||||
|
||||
auto CanWidenInstructionTy = [](Instruction const &Inst) {
|
||||
Type *InstTy = Inst.getType();
|
||||
if (!isa<StructType>(InstTy))
|
||||
return canVectorizeTy(InstTy);
|
||||
|
||||
// For now, we only recognize struct values returned from calls where
|
||||
// all users are extractvalue as vectorizable. All element types of the
|
||||
// struct must be types that can be widened.
|
||||
return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
|
||||
all_of(Inst.users(), IsaPred<ExtractValueInst>);
|
||||
};
|
||||
|
||||
// Check that the instruction return type is vectorizable.
|
||||
// We can't vectorize casts from vector type to scalar type.
|
||||
// Also, we can't vectorize extractelement instructions.
|
||||
if (!CanWidenInstructionTy(I) ||
|
||||
(isa<CastInst>(I) &&
|
||||
!VectorType::isValidElementType(I.getOperand(0)->getType())) ||
|
||||
isa<ExtractElementInst>(I)) {
|
||||
reportVectorizationFailure("Found unvectorizable type",
|
||||
"instruction return type cannot be vectorized",
|
||||
"CantVectorizeInstructionReturnType", ORE,
|
||||
TheLoop, &I);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that the stored type is vectorizable.
|
||||
if (auto *ST = dyn_cast<StoreInst>(&I)) {
|
||||
Type *T = ST->getValueOperand()->getType();
|
||||
if (!VectorType::isValidElementType(T)) {
|
||||
reportVectorizationFailure("Store instruction cannot be vectorized",
|
||||
"CantVectorizeStore", ORE, TheLoop, ST);
|
||||
return false;
|
||||
}
|
||||
|
||||
// For nontemporal stores, check that a nontemporal vector version is
|
||||
// supported on the target.
|
||||
if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
|
||||
// Arbitrarily try a vector of 2 elements.
|
||||
auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
|
||||
assert(VecTy && "did not find vectorized version of stored type");
|
||||
if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
|
||||
reportVectorizationFailure(
|
||||
"nontemporal store instruction cannot be vectorized",
|
||||
"CantVectorizeNontemporalStore", ORE, TheLoop, ST);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (auto *LD = dyn_cast<LoadInst>(&I)) {
|
||||
if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
|
||||
// For nontemporal loads, check that a nontemporal vector version is
|
||||
// supported on the target (arbitrarily try a vector of 2 elements).
|
||||
auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
|
||||
assert(VecTy && "did not find vectorized version of load type");
|
||||
if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
|
||||
reportVectorizationFailure(
|
||||
"nontemporal load instruction cannot be vectorized",
|
||||
"CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// FP instructions can allow unsafe algebra, thus vectorizable by
|
||||
// non-IEEE-754 compliant SIMD units.
|
||||
// This applies to floating-point math operations and calls, not memory
|
||||
// operations, shuffles, or casts, as they don't change precision or
|
||||
// semantics.
|
||||
} else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
|
||||
!I.isFast()) {
|
||||
LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
|
||||
Hints->setPotentiallyUnsafe();
|
||||
}
|
||||
|
||||
// Reduction instructions are allowed to have exit users.
|
||||
// All other instructions must not have external users.
|
||||
if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
|
||||
// We can safely vectorize loops where instructions within the loop are
|
||||
// used outside the loop only if the SCEV predicates within the loop is
|
||||
// same as outside the loop. Allowing the exit means reusing the SCEV
|
||||
// outside the loop.
|
||||
if (PSE.getPredicate().isAlwaysTrue()) {
|
||||
AllowedExit.insert(&I);
|
||||
return true;
|
||||
}
|
||||
reportVectorizationFailure("Value cannot be used outside the loop",
|
||||
"ValueUsedOutsideLoop", ORE, TheLoop, &I);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -256,13 +256,15 @@ public:
|
||||
new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
|
||||
GEPNoWrapFlags::none(), DL, Name));
|
||||
}
|
||||
VPInstruction *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset,
|
||||
DebugLoc DL = DebugLoc::getUnknown(),
|
||||
const Twine &Name = "") {
|
||||
return tryInsertInstruction(
|
||||
new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
|
||||
GEPNoWrapFlags::inBounds(), DL, Name));
|
||||
|
||||
VPInstruction *createNoWrapPtrAdd(VPValue *Ptr, VPValue *Offset,
|
||||
GEPNoWrapFlags GEPFlags,
|
||||
DebugLoc DL = DebugLoc::getUnknown(),
|
||||
const Twine &Name = "") {
|
||||
return tryInsertInstruction(new VPInstruction(
|
||||
VPInstruction::PtrAdd, {Ptr, Offset}, GEPFlags, DL, Name));
|
||||
}
|
||||
|
||||
VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset,
|
||||
DebugLoc DL = DebugLoc::getUnknown(),
|
||||
const Twine &Name = "") {
|
||||
|
||||
@ -2615,9 +2615,7 @@ void VPlanTransforms::createInterleaveGroups(
|
||||
VPValue *OffsetVPV =
|
||||
Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
|
||||
VPBuilder B(InsertPos);
|
||||
Addr = NW.isInBounds()
|
||||
? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
|
||||
: B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
|
||||
Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
|
||||
}
|
||||
// If the group is reverse, adjust the index to refer to the last vector
|
||||
// lane instead of the first. We adjust the index from the first vector
|
||||
|
||||
@ -1812,6 +1812,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
|
||||
// erased in the correct order.
|
||||
Worklist.push(LI);
|
||||
|
||||
Type *ElemType = VecTy->getElementType();
|
||||
|
||||
// Replace extracts with narrow scalar loads.
|
||||
for (User *U : LI->users()) {
|
||||
auto *EI = cast<ExtractElementInst>(U);
|
||||
@ -1825,13 +1827,19 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
|
||||
Builder.SetInsertPoint(EI);
|
||||
Value *GEP =
|
||||
Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
|
||||
auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
|
||||
VecTy->getElementType(), GEP, EI->getName() + ".scalar"));
|
||||
auto *NewLoad = cast<LoadInst>(
|
||||
Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
|
||||
|
||||
Align ScalarOpAlignment = computeAlignmentAfterScalarization(
|
||||
LI->getAlign(), VecTy->getElementType(), Idx, *DL);
|
||||
Align ScalarOpAlignment =
|
||||
computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
|
||||
NewLoad->setAlignment(ScalarOpAlignment);
|
||||
|
||||
if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
|
||||
size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
|
||||
AAMDNodes OldAAMD = LI->getAAMetadata();
|
||||
NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
|
||||
}
|
||||
|
||||
replaceValue(*EI, *NewLoad, false);
|
||||
}
|
||||
|
||||
|
||||
198
llvm/test/Analysis/DemandedBits/ashr.ll
Normal file
198
llvm/test/Analysis/DemandedBits/ashr.ll
Normal file
@ -0,0 +1,198 @@
|
||||
; RUN: opt -S -disable-output -passes="print<demanded-bits>" < %s 2>&1 | FileCheck %s
|
||||
|
||||
define i8 @test_ashr_const_amount_4(i32 %a) {
|
||||
; CHECK-LABEL: 'test_ashr_const_amount_4'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, 4
|
||||
; CHECK-DAG: DemandedBits: 0xff0 for %a in %ashr = ashr i32 %a, 4
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 4 in %ashr = ashr i32 %a, 4
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr.t = trunc i32 %ashr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8
|
||||
;
|
||||
%ashr = ashr i32 %a, 4
|
||||
%ashr.t = trunc i32 %ashr to i8
|
||||
ret i8 %ashr.t
|
||||
}
|
||||
|
||||
define i8 @test_ashr_const_amount_5(i32 %a) {
|
||||
; CHECK-LABEL: 'test_ashr_const_amount_5'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, 5
|
||||
; CHECK-DAG: DemandedBits: 0x1fe0 for %a in %ashr = ashr i32 %a, 5
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 5 in %ashr = ashr i32 %a, 5
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr.t = trunc i32 %ashr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8
|
||||
;
|
||||
%ashr = ashr i32 %a, 5
|
||||
%ashr.t = trunc i32 %ashr to i8
|
||||
ret i8 %ashr.t
|
||||
}
|
||||
|
||||
define i8 @test_ashr_const_amount_8(i32 %a) {
|
||||
; CHECK-LABEL: 'test_ashr_const_amount_8'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, 8
|
||||
; CHECK-DAG: DemandedBits: 0xff00 for %a in %ashr = ashr i32 %a, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %ashr = ashr i32 %a, 8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr.t = trunc i32 %ashr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8
|
||||
;
|
||||
%ashr = ashr i32 %a, 8
|
||||
%ashr.t = trunc i32 %ashr to i8
|
||||
ret i8 %ashr.t
|
||||
}
|
||||
|
||||
; Arithmetic shift right by the constant 9, truncated to i8.
; The low 8 result bits come from bits [9, 16] of %a, i.e. 0xff << 9 = 0x1fe00.
; None of the demanded result bits lie in the top 9 (sign-filled) bits, so the
; sign bit of %a is not demanded.
; The shift amount was previously 8, which duplicated test_ashr_const_amount_8
; and left the 9-bit case untested.
define i8 @test_ashr_const_amount_9(i32 %a) {
; CHECK-LABEL: 'test_ashr_const_amount_9'
; CHECK-DAG: DemandedBits: 0xff for %ashr.t = trunc i32 %ashr to i8
; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8
; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, 9
; CHECK-DAG: DemandedBits: 0x1fe00 for %a in %ashr = ashr i32 %a, 9
; CHECK-DAG: DemandedBits: 0xffffffff for 9 in %ashr = ashr i32 %a, 9
;
  %ashr = ashr i32 %a, 9
  %ashr.t = trunc i32 %ashr to i8
  ret i8 %ashr.t
}
|
||||
|
||||
define i8 @test_ashr(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_ashr'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %ashr = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %ashr = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr.t = trunc i32 %ashr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr in %ashr.t = trunc i32 %ashr to i8
|
||||
;
|
||||
%ashr = ashr i32 %a, %b
|
||||
%ashr.t = trunc i32 %ashr to i8
|
||||
ret i8 %ashr.t
|
||||
}
|
||||
|
||||
define i8 @test_ashr_range_1(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_ashr_range_1'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %shl.t = trunc i32 %ashr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr in %shl.t = trunc i32 %ashr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0xff for %ashr = ashr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0x7ff for %a in %ashr = ashr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %ashr = ashr i32 %a, %b2
|
||||
;
|
||||
%b2 = and i32 %b, 3
|
||||
%ashr = ashr i32 %a, %b2
|
||||
%shl.t = trunc i32 %ashr to i8
|
||||
ret i8 %shl.t
|
||||
}
|
||||
|
||||
define i32 @test_ashr_range_2(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_ashr_range_2'
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %ashr = ashr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %ashr = ashr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %ashr = ashr i32 %a, %b2
|
||||
;
|
||||
%b2 = and i32 %b, 3
|
||||
%ashr = ashr i32 %a, %b2
|
||||
ret i32 %ashr
|
||||
}
|
||||
|
||||
define i32 @test_ashr_range_3(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_ashr_range_3'
|
||||
; CHECK-DAG: DemandedBits: 0xffff for %ashr = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %ashr = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %ashr = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %shl = shl i32 %ashr, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffff for %ashr in %shl = shl i32 %ashr, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %shl = shl i32 %ashr, 16
|
||||
;
|
||||
%ashr = ashr i32 %a, %b
|
||||
%shl = shl i32 %ashr, 16
|
||||
ret i32 %shl
|
||||
}
|
||||
define i32 @test_ashr_range_4(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_ashr_range_4'
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %shr = lshr i32 %ashr, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffff00 for %ashr in %shr = lshr i32 %ashr, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %shr = lshr i32 %ashr, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffff00 for %ashr = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffff00 for %a in %ashr = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %ashr = ashr i32 %a, %b
|
||||
%ashr = ashr i32 %a, %b
|
||||
%shr = lshr i32 %ashr, 8
|
||||
ret i32 %shr
|
||||
}
|
||||
|
||||
define i32 @test_ashr_range_5(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_ashr_range_5'
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %2 = and i32 %1, 255
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = and i32 %1, 255
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 255 in %2 = and i32 %1, 255
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = ashr i32 %a, %b
|
||||
;
|
||||
%1 = ashr i32 %a, %b
|
||||
%2 = and i32 %1, 255
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i32 @test_ashr_range_6(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_ashr_range_6'
|
||||
; CHECK-DAG: DemandedBits: 0xffff0000 for %lshr.1 = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffff0000 for %a in %lshr.1 = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %lshr.1 = ashr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %lshr.2 = ashr i32 %lshr.1, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffff0000 for %lshr.1 in %lshr.2 = ashr i32 %lshr.1, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %lshr.2 = ashr i32 %lshr.1, 16
|
||||
;
|
||||
%lshr.1 = ashr i32 %a, %b
|
||||
%lshr.2 = ashr i32 %lshr.1, 16
|
||||
ret i32 %lshr.2
|
||||
}
|
||||
|
||||
define i8 @test_ashr_var_amount(i32 %a, i32 %b){
|
||||
; CHECK-LABEL: 'test_ashr_var_amount'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %4 = ashr i32 %1, %3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = ashr i32 %1, %3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = ashr i32 %1, %3
|
||||
; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = add nsw i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32
|
||||
; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32
|
||||
; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8
|
||||
;
|
||||
%1 = add nsw i32 %a, %b
|
||||
%2 = trunc i32 %1 to i8
|
||||
%3 = zext i8 %2 to i32
|
||||
%4 = ashr i32 %1, %3
|
||||
%5 = trunc i32 %4 to i8
|
||||
ret i8 %5
|
||||
}
|
||||
|
||||
; Variable-amount ashr whose shift amount is derived from the shifted value.
; Despite the "_nsw" suffix in the name, the shift under test carries the
; `exact` flag (ashr has no nsw flag); the name is kept for compatibility.
; The CHECK-LABEL directive needs its trailing colon: without it FileCheck
; treats the line as a plain comment and the CHECK-DAG lines below are not
; anchored to this function's output.
define i8 @test_ashr_var_amount_nsw(i32 %a, i32 %b){
; CHECK-LABEL: 'test_ashr_var_amount_nsw'
; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8
; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8
; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = add nsw i32 %a, %b
; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8
; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8
; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32
; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32
; CHECK-DAG: DemandedBits: 0xff for %4 = ashr exact i32 %1, %3
; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = ashr exact i32 %1, %3
; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = ashr exact i32 %1, %3
;
  %1 = add nsw i32 %a, %b
  %2 = trunc i32 %1 to i8
  %3 = zext i8 %2 to i32
  %4 = ashr exact i32 %1, %3
  %5 = trunc i32 %4 to i8
  ret i8 %5
}
|
||||
198
llvm/test/Analysis/DemandedBits/lshr.ll
Normal file
198
llvm/test/Analysis/DemandedBits/lshr.ll
Normal file
@ -0,0 +1,198 @@
|
||||
; RUN: opt -S -disable-output -passes="print<demanded-bits>" < %s 2>&1 | FileCheck %s
|
||||
|
||||
define i8 @test_lshr_const_amount_4(i32 %a) {
|
||||
; CHECK-LABEL: 'test_lshr_const_amount_4'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, 4
|
||||
; CHECK-DAG: DemandedBits: 0xff0 for %a in %lshr = lshr i32 %a, 4
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 4 in %lshr = lshr i32 %a, 4
|
||||
;
|
||||
%lshr = lshr i32 %a, 4
|
||||
%lshr.t = trunc i32 %lshr to i8
|
||||
ret i8 %lshr.t
|
||||
}
|
||||
|
||||
define i8 @test_lshr_const_amount_5(i32 %a) {
|
||||
; CHECK-LABEL: 'test_lshr_const_amount_5'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, 5
|
||||
; CHECK-DAG: DemandedBits: 0x1fe0 for %a in %lshr = lshr i32 %a, 5
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 5 in %lshr = lshr i32 %a, 5
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8
|
||||
;
|
||||
%lshr = lshr i32 %a, 5
|
||||
%lshr.t = trunc i32 %lshr to i8
|
||||
ret i8 %lshr.t
|
||||
}
|
||||
define i8 @test_lshr_const_amount_8(i32 %a) {
|
||||
; CHECK-LABEL: 'test_lshr_const_amount_8'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, 8
|
||||
; CHECK-DAG: DemandedBits: 0xff00 for %a in %lshr = lshr i32 %a, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %lshr = lshr i32 %a, 8
|
||||
;
|
||||
%lshr = lshr i32 %a, 8
|
||||
%lshr.t = trunc i32 %lshr to i8
|
||||
ret i8 %lshr.t
|
||||
}
|
||||
|
||||
define i8 @test_lshr_const_amount_9(i32 %a) {
|
||||
; CHECK-LABEL: 'test_lshr_const_amount_9'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, 9
|
||||
; CHECK-DAG: DemandedBits: 0x1fe00 for %a in %lshr = lshr i32 %a, 9
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 9 in %lshr = lshr i32 %a, 9
|
||||
;
|
||||
%lshr = lshr i32 %a, 9
|
||||
%lshr.t = trunc i32 %lshr to i8
|
||||
ret i8 %lshr.t
|
||||
}
|
||||
|
||||
define i8 @test_lshr(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_lshr'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %lshr = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %lshr = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr in %lshr.t = trunc i32 %lshr to i8
|
||||
;
|
||||
%lshr = lshr i32 %a, %b
|
||||
%lshr.t = trunc i32 %lshr to i8
|
||||
ret i8 %lshr.t
|
||||
}
|
||||
|
||||
define i8 @test_lshr_range_1(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_lshr_range_1'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %shl.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr in %shl.t = trunc i32 %lshr to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %lshr = lshr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0x7ff for %a in %lshr = lshr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %lshr = lshr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3
|
||||
;
|
||||
%b2 = and i32 %b, 3
|
||||
%lshr = lshr i32 %a, %b2
|
||||
%shl.t = trunc i32 %lshr to i8
|
||||
ret i8 %shl.t
|
||||
}
|
||||
|
||||
define i32 @test_lshr_range_2(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_lshr_range_2'
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %lshr = lshr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %lshr = lshr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %lshr = lshr i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3
|
||||
;
|
||||
%b2 = and i32 %b, 3
|
||||
%lshr = lshr i32 %a, %b2
|
||||
ret i32 %lshr
|
||||
}
|
||||
|
||||
define i32 @test_lshr_range_3(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_lshr_range_3'
|
||||
; CHECK-DAG: DemandedBits: 0xffff for %lshr = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %lshr = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %lshr = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %shl = shl i32 %lshr, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffff for %lshr in %shl = shl i32 %lshr, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %shl = shl i32 %lshr, 16
|
||||
;
|
||||
%lshr = lshr i32 %a, %b
|
||||
%shl = shl i32 %lshr, 16
|
||||
ret i32 %shl
|
||||
}
|
||||
|
||||
define i32 @test_lshr_range_4(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_lshr_range_4'
|
||||
; CHECK-DAG: DemandedBits: 0xffffff00 for %lshr = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffff00 for %a in %lshr = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %lshr = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %shr = ashr i32 %lshr, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffff00 for %lshr in %shr = ashr i32 %lshr, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %shr = ashr i32 %lshr, 8
|
||||
%lshr = lshr i32 %a, %b
|
||||
%shr = ashr i32 %lshr, 8
|
||||
ret i32 %shr
|
||||
}
|
||||
|
||||
define i32 @test_lshr_range_5(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_lshr_range_5'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %2 = and i32 %1, 255
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = and i32 %1, 255
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 255 in %2 = and i32 %1, 255
|
||||
;
|
||||
%1 = lshr i32 %a, %b
|
||||
%2 = and i32 %1, 255
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i32 @test_lshr_range_6(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_lshr_range_6'
|
||||
; CHECK-DAG: DemandedBits: 0xffff0000 for %lshr.1 = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffff0000 for %a in %lshr.1 = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %lshr.1 = lshr i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %lshr.2 = lshr i32 %lshr.1, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffff0000 for %lshr.1 in %lshr.2 = lshr i32 %lshr.1, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %lshr.2 = lshr i32 %lshr.1, 16
|
||||
;
|
||||
%lshr.1 = lshr i32 %a, %b
|
||||
%lshr.2 = lshr i32 %lshr.1, 16
|
||||
ret i32 %lshr.2
|
||||
}
|
||||
|
||||
|
||||
define i8 @test_lshr_var_amount(i32 %a, i32 %b){
|
||||
; CHECK-LABEL: 'test_lshr_var_amount'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %4 = lshr i32 %1, %3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = lshr i32 %1, %3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = lshr i32 %1, %3
|
||||
; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32
|
||||
; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = add nsw i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8
|
||||
;
|
||||
%1 = add nsw i32 %a, %b
|
||||
%2 = trunc i32 %1 to i8
|
||||
%3 = zext i8 %2 to i32
|
||||
%4 = lshr i32 %1, %3
|
||||
%5 = trunc i32 %4 to i8
|
||||
ret i8 %5
|
||||
}
|
||||
|
||||
; Variable-amount `lshr exact` whose shift amount is derived from the shifted
; value. The CHECK-LABEL directive previously lacked its colon (so FileCheck
; ignored it) and quoted a different function name; it now anchors the
; CHECK-DAG lines to this function's section of the printed analysis.
define i8 @test_lshr_var_amount_exact(i32 %a, i32 %b){
; CHECK-LABEL: 'test_lshr_var_amount_exact'
; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = add nsw i32 %a, %b
; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8
; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8
; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32
; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32
; CHECK-DAG: DemandedBits: 0xff for %4 = lshr exact i32 %1, %3
; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = lshr exact i32 %1, %3
; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = lshr exact i32 %1, %3
; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8
; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8
;
  %1 = add nsw i32 %a, %b
  %2 = trunc i32 %1 to i8
  %3 = zext i8 %2 to i32
  %4 = lshr exact i32 %1, %3
  %5 = trunc i32 %4 to i8
  ret i8 %5
}
|
||||
@ -57,10 +57,142 @@ define i8 @test_shl(i32 %a, i32 %b) {
|
||||
; CHECK-DAG: DemandedBits: 0xff for %shl.t = trunc i32 %shl to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %shl in %shl.t = trunc i32 %shl to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %shl = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %shl = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xff for %a in %shl = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %shl = shl i32 %a, %b
|
||||
;
|
||||
%shl = shl i32 %a, %b
|
||||
%shl.t = trunc i32 %shl to i8
|
||||
ret i8 %shl.t
|
||||
}
|
||||
|
||||
define i8 @test_shl_range_1(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_shl_range_1'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %shl = shl i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xff for %a in %shl = shl i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %shl = shl i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xff for %shl.t = trunc i32 %shl to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %shl in %shl.t = trunc i32 %shl to i8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3
|
||||
;
|
||||
%b2 = and i32 %b, 3
|
||||
%shl = shl i32 %a, %b2
|
||||
%shl.t = trunc i32 %shl to i8
|
||||
ret i8 %shl.t
|
||||
}
|
||||
|
||||
define i32 @test_shl_range_2(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_shl_range_2'
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0x3 for %b in %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 3 in %b2 = and i32 %b, 3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %shl = shl i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %shl = shl i32 %a, %b2
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b2 in %shl = shl i32 %a, %b2
|
||||
;
|
||||
%b2 = and i32 %b, 3
|
||||
%shl = shl i32 %a, %b2
|
||||
ret i32 %shl
|
||||
}
|
||||
|
||||
define i32 @test_shl_range_3(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_shl_range_3'
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %shr = lshr i32 %shl, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffff0000 for %shl in %shr = lshr i32 %shl, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %shr = lshr i32 %shl, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffff0000 for %shl = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %shl = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %shl = shl i32 %a, %b
|
||||
;
|
||||
%shl = shl i32 %a, %b
|
||||
%shr = lshr i32 %shl, 16
|
||||
ret i32 %shr
|
||||
}
|
||||
|
||||
define i32 @test_shl_range_4(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_shl_range_4'
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %shr = ashr i32 %shl, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffff00 for %shl in %shr = ashr i32 %shl, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 8 in %shr = ashr i32 %shl, 8
|
||||
; CHECK-DAG: DemandedBits: 0xffffff00 for %shl = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %shl = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %shl = shl i32 %a, %b
|
||||
%shl = shl i32 %a, %b
|
||||
%shr = ashr i32 %shl, 8
|
||||
ret i32 %shr
|
||||
}
|
||||
|
||||
define i32 @test_shl_range_5(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_shl_range_5'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xff for %a in %1 = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %2 = and i32 %1, 255
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = and i32 %1, 255
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 255 in %2 = and i32 %1, 255
|
||||
;
|
||||
%1 = shl i32 %a, %b
|
||||
%2 = and i32 %1, 255
|
||||
ret i32 %2
|
||||
}
|
||||
|
||||
define i32 @test_shl_range_6(i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: 'test_shl_range_6'
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %shl.2 = shl i32 %shl.1, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffff for %shl.1 in %shl.2 = shl i32 %shl.1, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for 16 in %shl.2 = shl i32 %shl.1, 16
|
||||
; CHECK-DAG: DemandedBits: 0xffff for %shl.1 = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffff for %a in %shl.1 = shl i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %shl.1 = shl i32 %a, %b
|
||||
;
|
||||
%shl.1 = shl i32 %a, %b
|
||||
%shl.2 = shl i32 %shl.1, 16
|
||||
ret i32 %shl.2
|
||||
}
|
||||
|
||||
define i8 @test_shl_var_amount(i32 %a, i32 %b){
|
||||
; CHECK-LABEL: 'test_shl_var_amount'
|
||||
; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %4 = shl i32 %1, %3
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 in %4 = shl i32 %1, %3
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = shl i32 %1, %3
|
||||
; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8
|
||||
; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32
|
||||
; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32
|
||||
; CHECK-DAG: DemandedBits: 0xff for %1 = add nsw i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xff for %a in %1 = add nsw i32 %a, %b
|
||||
; CHECK-DAG: DemandedBits: 0xff for %b in %1 = add nsw i32 %a, %b
|
||||
;
|
||||
%1 = add nsw i32 %a, %b
|
||||
%2 = trunc i32 %1 to i8
|
||||
%3 = zext i8 %2 to i32
|
||||
%4 = shl i32 %1, %3
|
||||
%5 = trunc i32 %4 to i8
|
||||
ret i8 %5
|
||||
}
|
||||
|
||||
; Variable-amount `shl nsw` whose shift amount is derived from the shifted
; value. The checks expect every bit of the shifted operand %1 to be demanded
; for the nsw shift, unlike the flag-free shl in test_shl_var_amount (0xff).
; The CHECK-LABEL directive needs its trailing colon: without it FileCheck
; treats the line as a plain comment and the CHECK-DAG lines below are not
; anchored to this function's output.
define i8 @test_shl_var_amount_nsw(i32 %a, i32 %b){
; CHECK-LABEL: 'test_shl_var_amount_nsw'
; CHECK-DAG: DemandedBits: 0xff for %5 = trunc i32 %4 to i8
; CHECK-DAG: DemandedBits: 0xff for %4 in %5 = trunc i32 %4 to i8
; CHECK-DAG: DemandedBits: 0xff for %4 = shl nsw i32 %1, %3
; CHECK-DAG: DemandedBits: 0xffffffff for %1 in %4 = shl nsw i32 %1, %3
; CHECK-DAG: DemandedBits: 0xffffffff for %3 in %4 = shl nsw i32 %1, %3
; CHECK-DAG: DemandedBits: 0xffffffff for %3 = zext i8 %2 to i32
; CHECK-DAG: DemandedBits: 0xff for %2 in %3 = zext i8 %2 to i32
; CHECK-DAG: DemandedBits: 0xff for %2 = trunc i32 %1 to i8
; CHECK-DAG: DemandedBits: 0xff for %1 in %2 = trunc i32 %1 to i8
; CHECK-DAG: DemandedBits: 0xffffffff for %1 = add nsw i32 %a, %b
; CHECK-DAG: DemandedBits: 0xffffffff for %a in %1 = add nsw i32 %a, %b
; CHECK-DAG: DemandedBits: 0xffffffff for %b in %1 = add nsw i32 %a, %b
;
  %1 = add nsw i32 %a, %b
  %2 = trunc i32 %1 to i8
  %3 = zext i8 %2 to i32
  %4 = shl nsw i32 %1, %3
  %5 = trunc i32 %4 to i8
  ret i8 %5
}
|
||||
|
||||
@ -164,7 +164,7 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: v_mul_i16_zeroext:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -2257,8 +2257,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
|
||||
@ -2273,17 +2273,19 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
|
||||
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
|
||||
@ -2293,14 +2295,16 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -4502,8 +4506,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
|
||||
@ -4518,17 +4522,19 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2
|
||||
; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true
|
||||
@ -4538,14 +4544,16 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -6459,8 +6467,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
|
||||
@ -6475,17 +6483,19 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
|
||||
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
|
||||
@ -6495,14 +6505,16 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -8104,8 +8116,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
|
||||
@ -8120,17 +8132,19 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
|
||||
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
|
||||
@ -8140,14 +8154,16 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -9463,8 +9479,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
|
||||
@ -9479,17 +9495,19 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2
|
||||
; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true
|
||||
@ -9499,14 +9517,16 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -10173,8 +10193,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
|
||||
@ -10189,17 +10209,19 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
|
||||
; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true
|
||||
@ -10209,14 +10231,16 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -3065,12 +3065,13 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
|
||||
@ -3084,53 +3085,61 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
|
||||
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -6205,12 +6214,13 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
|
||||
@ -6224,53 +6234,61 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
|
||||
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -9045,12 +9063,13 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
|
||||
@ -9064,53 +9083,61 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
|
||||
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -11576,12 +11603,13 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
|
||||
@ -11595,53 +11623,61 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
|
||||
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -13793,12 +13829,13 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
|
||||
@ -13812,53 +13849,61 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
|
||||
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -15610,12 +15655,13 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
|
||||
@ -15629,53 +15675,61 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
|
||||
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -16912,12 +16966,13 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
|
||||
@ -16931,53 +16986,61 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
|
||||
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
||||
@ -1102,16 +1102,15 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
|
||||
@ -1126,74 +1125,80 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
|
||||
; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -4236,16 +4241,15 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
|
||||
@ -4260,74 +4264,80 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
|
||||
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -6875,16 +6885,16 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
|
||||
@ -6899,74 +6909,80 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2
|
||||
; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -8635,16 +8651,16 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
|
||||
@ -8659,74 +8675,80 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2
|
||||
; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -10043,16 +10065,16 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
||||
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
|
||||
@ -10067,74 +10089,80 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
|
||||
; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
|
||||
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
|
||||
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
|
||||
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2
|
||||
; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc %s -o %t.o -mcpu=gfx1030 -filetype=obj -O0
|
||||
; RUN: llvm-debuginfo-analyzer %t.o --print=all --attribute=all | FileCheck %s
|
||||
|
||||
|
||||
@ -9022,12 +9022,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
|
||||
; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0
|
||||
; GFX1164-TRUE16-NEXT: .LBB15_2:
|
||||
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l
|
||||
; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
|
||||
@ -9100,12 +9101,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
|
||||
; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0
|
||||
; GFX1132-TRUE16-NEXT: .LBB15_2:
|
||||
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
|
||||
; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l
|
||||
; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
|
||||
@ -9178,12 +9180,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
|
||||
; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0
|
||||
; GFX1264-TRUE16-NEXT: .LBB15_2:
|
||||
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l
|
||||
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
|
||||
@ -9256,12 +9259,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
|
||||
; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0
|
||||
; GFX1232-TRUE16-NEXT: .LBB15_2:
|
||||
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
|
||||
; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l
|
||||
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
|
||||
@ -9658,11 +9662,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
|
||||
; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
|
||||
; GFX1164-TRUE16-NEXT: .LBB16_4: ; %Flow
|
||||
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
|
||||
; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
|
||||
@ -9784,11 +9789,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
|
||||
; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
|
||||
; GFX1132-TRUE16-NEXT: .LBB16_4: ; %Flow
|
||||
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
|
||||
; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
|
||||
; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
|
||||
@ -9910,12 +9916,13 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
|
||||
; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
|
||||
; GFX1264-TRUE16-NEXT: .LBB16_4: ; %Flow
|
||||
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
|
||||
; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
|
||||
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
|
||||
@ -10041,12 +10048,13 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
|
||||
; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
|
||||
; GFX1232-TRUE16-NEXT: .LBB16_4: ; %Flow
|
||||
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
|
||||
; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
|
||||
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
|
||||
@ -10726,15 +10734,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
|
||||
; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
|
||||
; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1
|
||||
; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0
|
||||
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
|
||||
; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -10820,14 +10828,14 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
|
||||
; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1
|
||||
; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
|
||||
; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
|
||||
; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0
|
||||
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
|
||||
; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
|
||||
; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -10912,15 +10920,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
|
||||
; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
|
||||
; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1
|
||||
; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0
|
||||
; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
|
||||
; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
@ -11006,14 +11014,14 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
|
||||
; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1
|
||||
; GFX1232-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
|
||||
; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
|
||||
; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0
|
||||
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
|
||||
; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
|
||||
; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
|
||||
@ -37774,10 +37774,9 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
|
||||
; GFX11TRUE16-LABEL: v_uitofp_i16_to_bf16:
|
||||
; GFX11TRUE16: ; %bb.0:
|
||||
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
|
||||
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v1
|
||||
; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
|
||||
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
|
||||
@ -40751,11 +40750,12 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
|
||||
;
|
||||
; GFX11TRUE16-LABEL: s_select_bf16:
|
||||
; GFX11TRUE16: ; %bb.0:
|
||||
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
|
||||
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
|
||||
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v1.l, vcc_lo
|
||||
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
|
||||
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
|
||||
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX11TRUE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
|
||||
@ -3443,14 +3443,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
|
||||
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
@ -3568,13 +3569,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
|
||||
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -3882,14 +3884,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
|
||||
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
@ -4004,13 +4007,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
|
||||
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -4324,14 +4328,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
|
||||
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
|
||||
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
|
||||
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
@ -4551,14 +4556,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
|
||||
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
|
||||
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
|
||||
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
|
||||
@ -2512,16 +2512,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
|
||||
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
@ -2640,19 +2640,20 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
|
||||
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
|
||||
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
|
||||
; GFX11-TRUE16-NEXT: .p2align 6
|
||||
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
|
||||
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2972,16 +2973,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
|
||||
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
@ -3097,19 +3098,20 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
|
||||
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
|
||||
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
|
||||
; GFX11-TRUE16-NEXT: .p2align 6
|
||||
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
|
||||
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -3435,16 +3437,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
|
||||
; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
|
||||
; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
|
||||
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
@ -3670,16 +3672,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
|
||||
; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
|
||||
; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
|
||||
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
|
||||
@ -2512,16 +2512,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
|
||||
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
@ -2640,19 +2640,20 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
|
||||
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
|
||||
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
|
||||
; GFX11-TRUE16-NEXT: .p2align 6
|
||||
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
|
||||
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2972,16 +2973,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
|
||||
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
@ -3097,19 +3098,20 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
|
||||
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
|
||||
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
|
||||
; GFX11-TRUE16-NEXT: .p2align 6
|
||||
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
|
||||
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -3435,16 +3437,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
|
||||
; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
|
||||
; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
|
||||
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
@ -3670,16 +3672,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
|
||||
; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
|
||||
; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
|
||||
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
|
||||
@ -2745,15 +2745,6 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
|
||||
;
|
||||
; GFX11-TRUE16-LABEL: amdgpu_cs_v32i1:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, v26.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, v22.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, v18.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, v10.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 1, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, v8.l, 1
|
||||
@ -2763,18 +2754,6 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 3, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 2, v26.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v24.l, v25.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 3, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 2, v20.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 3, v19.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 2, v17.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, v14.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 1, v13.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, v12.l, 1
|
||||
@ -2787,15 +2766,15 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 2, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 3, v31.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 2, v30.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v26.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, v22.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v22.l, v20.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, v16.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v18.h, v17.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, v26.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, v22.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, v18.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 3, v15.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 2, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v12.l, v13.l
|
||||
@ -2805,42 +2784,65 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v28.h, v29.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v28.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v21.h, v25.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v18.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 3, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 2, v25.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v25.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 3, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 2, v22.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 3, v19.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 2, v18.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.h, v10.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, v8.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v26.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v19.l, 15
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 4, v16.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 15
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 3, v31.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 2, v30.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v28.l, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v25.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v23.l, v22.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v16.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.h, v18.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v1.l, 15
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 12, v24.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v14.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v30.h, v28.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v24.h, 3
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v24.l, v22.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v14.h, v18.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v16.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v28.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v20.h, 15
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 4, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v1.h, 15
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v23.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off
|
||||
; GFX11-TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@ -1561,10 +1561,10 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp
|
||||
; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-TRUE16-NEXT: s_endpgm
|
||||
|
||||
@ -946,9 +946,9 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
|
||||
; GFX11-TRUE16-LABEL: v_uitofp_i8_to_f64:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -1770,38 +1770,40 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
|
||||
; GFX11-TRUE16-LABEL: load_v4i8_to_v4f32_2_uses:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
|
||||
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff00, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x900, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x900, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x900, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x900, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
|
||||
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v7
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: global_store_b128 v6, v[0:3], s[0:1]
|
||||
; GFX11-TRUE16-NEXT: global_store_b32 v6, v4, s[2:3]
|
||||
; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1]
|
||||
; GFX11-TRUE16-NEXT: global_store_b32 v5, v4, s[2:3]
|
||||
; GFX11-TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: load_v4i8_to_v4f32_2_uses:
|
||||
|
||||
@ -2536,13 +2536,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
|
||||
; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
|
||||
; GFX11-SDAG: ; %bb.0:
|
||||
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l
|
||||
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
|
||||
; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
|
||||
; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15
|
||||
; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
|
||||
; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0
|
||||
|
||||
@ -8410,12 +8410,13 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8528,12 +8529,13 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -8783,12 +8785,13 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8905,12 +8908,13 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -9167,12 +9171,13 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -9290,12 +9295,13 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -9551,11 +9557,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -9665,11 +9671,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
@ -9911,11 +9917,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -10029,11 +10035,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
@ -10282,11 +10288,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -10401,11 +10407,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
@ -10645,8 +10651,8 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -10729,8 +10735,8 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
|
||||
@ -10919,9 +10925,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -11007,9 +11014,10 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
|
||||
@ -11212,12 +11220,13 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -11336,12 +11345,13 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -11600,11 +11610,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -11720,11 +11730,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
|
||||
@ -6043,14 +6043,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6168,14 +6168,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -6438,14 +6438,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6570,14 +6570,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -6847,14 +6847,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6980,14 +6980,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -7254,12 +7254,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7375,12 +7376,13 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
@ -7636,12 +7638,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7764,12 +7767,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -8032,12 +8036,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8161,12 +8166,13 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -8418,11 +8424,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8513,11 +8519,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
|
||||
@ -8722,9 +8728,10 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8813,9 +8820,10 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
|
||||
@ -9027,14 +9035,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -9161,14 +9169,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -9440,12 +9448,13 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -9570,12 +9579,13 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
|
||||
@ -6043,14 +6043,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6168,14 +6168,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -6438,14 +6438,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6570,14 +6570,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -6847,14 +6847,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6980,14 +6980,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -7254,12 +7254,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7375,12 +7376,13 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
@ -7636,12 +7638,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7764,12 +7767,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -8032,12 +8036,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8161,12 +8166,13 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -8418,11 +8424,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8513,11 +8519,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
|
||||
@ -8722,9 +8728,10 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8813,9 +8820,10 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
|
||||
@ -9027,14 +9035,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -9161,14 +9169,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
@ -9440,12 +9448,13 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -9570,12 +9579,13 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
|
||||
|
||||
@ -5855,12 +5855,13 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -5973,12 +5974,13 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -6228,12 +6230,13 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6350,12 +6353,13 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -6612,12 +6616,13 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6735,12 +6740,13 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -6996,11 +7002,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7110,11 +7116,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
@ -7356,11 +7362,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7474,11 +7480,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
@ -7727,11 +7733,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7846,11 +7852,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
@ -8090,9 +8096,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8178,9 +8185,10 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
|
||||
@ -8374,8 +8382,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8458,8 +8466,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
|
||||
@ -8657,12 +8665,13 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -8781,12 +8790,13 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
|
||||
@ -9045,11 +9055,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -9165,11 +9175,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
|
||||
|
||||
@ -4238,7 +4238,7 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) {
|
||||
; GFX11-GISEL-TRUE16-LABEL: s_mul_32_f16:
|
||||
; GFX11-GISEL-TRUE16: ; %bb.0:
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e64 v0.l, 0x5000, s0
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
|
||||
;
|
||||
|
||||
@ -644,10 +644,11 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
|
||||
; GFX11-TRUE16-LABEL: fmul_pow_mul_max_pow2:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1]
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -1193,12 +1194,13 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
|
||||
; GFX11-TRUE16-LABEL: fmul_pow_shl_cnt_safe:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 1
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0xff5f3992
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0x7befffff
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1]
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
||||
@ -4372,13 +4372,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
|
||||
; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32:
|
||||
; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
@ -4606,13 +4607,14 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
|
||||
; GFX11-GISEL-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
|
||||
; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, |s2|
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@ -1107,19 +1107,21 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
|
||||
; GFX11-TRUE16-LABEL: void_func_v4i8:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -1188,20 +1190,22 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
|
||||
; GFX11-TRUE16-LABEL: void_func_v5i8:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -1281,27 +1285,29 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
|
||||
; GFX11-TRUE16-LABEL: void_func_v8i8:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v5.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v6
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v0.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v6
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4
|
||||
; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -1416,44 +1422,47 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
|
||||
; GFX11-TRUE16-LABEL: void_func_v16i8:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.h, v12.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v15.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v9.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v8.h, v13.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v14
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v13
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v10.l, v8.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v11
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v14
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v4, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v14
|
||||
; GFX11-TRUE16-NEXT: buffer_store_b128 v[5:8], off, s[0:3], 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v12
|
||||
; GFX11-TRUE16-NEXT: buffer_store_b128 v[6:9], off, s[0:3], 0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: void_func_v16i8:
|
||||
@ -1649,77 +1658,83 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v32.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v32.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v28.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v25.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v24.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v30.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v25.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v24.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v6.h, v5.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v30.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v11.h, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v8.h, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v26.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v6.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v32.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v20.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v7.h, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
|
||||
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.h, v5.h
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v31.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v19.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v17.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v14, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v31.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v9.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v19.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v10.h, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v11, v32
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.l, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32
|
||||
; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
|
||||
|
||||
@ -4896,22 +4896,23 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
|
||||
@ -5155,29 +5156,30 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v6
|
||||
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off
|
||||
; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v2, off
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
|
||||
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
|
||||
@ -5439,34 +5441,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
|
||||
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v5
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v5
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
|
||||
; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
|
||||
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
|
||||
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
|
||||
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
|
||||
@ -5906,77 +5910,85 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v19, v34
|
||||
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v3.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v13.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v15.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v3.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, 0
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v14.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v9.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v0.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v3.h, v1.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v13
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v12.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v30.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v13
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v29.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v12
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v13
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v2, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v22.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v30.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v13
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v26.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v27.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v21.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v12
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v19.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v18.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v12
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v6, v13
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v17.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v23.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v22.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v7, v13
|
||||
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v18.l
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v10
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v19.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v6, v13
|
||||
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v10
|
||||
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v13
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x1
|
||||
; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[0:3], off
|
||||
; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[5:8], off
|
||||
; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[6:9], off
|
||||
; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[2:5], off
|
||||
; GFX11-TRUE16-NEXT: s_clause 0x3
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33
|
||||
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4
|
||||
|
||||
@ -8275,12 +8275,13 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8393,12 +8394,13 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -8698,12 +8700,13 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8820,12 +8823,13 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -9134,12 +9138,13 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -9257,12 +9262,13 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -9570,11 +9576,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -9684,11 +9690,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
@ -9979,11 +9985,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -10097,11 +10103,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
@ -10400,11 +10406,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -10519,11 +10525,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
@ -10813,9 +10819,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -10901,9 +10908,10 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
|
||||
@ -11136,8 +11144,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -11220,8 +11228,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
|
||||
@ -11456,12 +11464,13 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -11580,12 +11589,13 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -11896,11 +11906,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -12016,11 +12026,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
|
||||
@ -4467,14 +4467,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -4592,14 +4592,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -4912,14 +4912,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -5044,14 +5044,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -5373,14 +5373,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -5506,14 +5506,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -5832,12 +5832,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -5953,12 +5954,13 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
@ -6263,12 +6265,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6391,12 +6394,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -6709,12 +6713,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6838,12 +6843,13 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -7145,11 +7151,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7240,11 +7246,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
|
||||
@ -7488,9 +7494,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7579,9 +7586,10 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
|
||||
@ -7830,14 +7838,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -7964,14 +7972,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -8295,12 +8303,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -8425,12 +8434,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
|
||||
@ -4467,14 +4467,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -4592,14 +4592,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -4912,14 +4912,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -5044,14 +5044,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -5373,14 +5373,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -5506,14 +5506,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -5832,12 +5832,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -5953,12 +5954,13 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
@ -6263,12 +6265,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6391,12 +6394,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -6709,12 +6713,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6838,12 +6843,13 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -7145,11 +7151,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7240,11 +7246,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
|
||||
@ -7488,9 +7494,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7579,9 +7586,10 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
|
||||
@ -7830,14 +7838,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -7964,14 +7972,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
@ -8295,12 +8303,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -8425,12 +8434,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
|
||||
|
||||
@ -5221,12 +5221,13 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -5339,12 +5340,13 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -5644,12 +5646,13 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -5766,12 +5769,13 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -6080,12 +6084,13 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6203,12 +6208,13 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -6516,11 +6522,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -6630,11 +6636,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
@ -6925,11 +6931,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7043,11 +7049,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
@ -7346,11 +7352,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7465,11 +7471,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
@ -7759,9 +7765,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -7847,9 +7854,10 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
|
||||
@ -8082,8 +8090,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
|
||||
@ -8166,8 +8174,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
|
||||
@ -8402,12 +8410,13 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -8526,12 +8535,13 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
|
||||
@ -8842,11 +8852,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
@ -8962,11 +8972,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
|
||||
|
||||
@ -1693,11 +1693,12 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302
|
||||
; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0
|
||||
; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5]
|
||||
; GFX11-DL-TRUE16-NEXT: s_endpgm
|
||||
@ -2723,32 +2724,32 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
|
||||
; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
|
||||
; GFX11-DL-TRUE16-NEXT: global_load_d16_u8 v0, v5, s[4:5]
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.l, 8, v4.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4
|
||||
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h
|
||||
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v2.l, v6.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
|
||||
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.h
|
||||
; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v1.l, v2.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v7, v6
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h
|
||||
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.h
|
||||
; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v0.h, v1.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
|
||||
; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
|
||||
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.h, v4.h, v0.l
|
||||
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-DL-TRUE16-NEXT: global_store_b8 v5, v0, s[4:5]
|
||||
; GFX11-DL-TRUE16-NEXT: s_endpgm
|
||||
|
||||
@ -1715,9 +1715,9 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16:
|
||||
@ -1745,7 +1745,8 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16:
|
||||
@ -1776,9 +1777,9 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16:
|
||||
@ -1814,7 +1815,8 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
|
||||
; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
|
||||
; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
|
||||
; GFX1200-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16:
|
||||
@ -9361,9 +9363,9 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
|
||||
@ -9407,7 +9409,8 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
|
||||
@ -9454,9 +9457,9 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
|
||||
@ -9508,7 +9511,8 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
|
||||
; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
|
||||
; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
|
||||
; GFX1200-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,SDAG %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
declare i32 @llvm.amdgcn.prng.b32(i32) #0
|
||||
@ -29,6 +29,13 @@ define amdgpu_kernel void @prng_b32_constant_100(ptr addrspace(1) %out) #1 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}prng_undef_i32:
|
||||
; SDAG-NOT: v_prng_b32
|
||||
define amdgpu_kernel void @prng_undef_i32(ptr addrspace(1) %out) #1 {
|
||||
%prng = call i32 @llvm.amdgcn.prng.b32(i32 undef)
|
||||
store i32 %prng, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind }
|
||||
|
||||
@ -1259,12 +1259,13 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
|
||||
@ -1370,12 +1371,13 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
|
||||
@ -1644,12 +1646,13 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
|
||||
@ -1760,12 +1763,13 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
|
||||
@ -2040,12 +2044,13 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
|
||||
@ -2148,12 +2153,13 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
|
||||
@ -2413,11 +2419,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
|
||||
@ -2525,11 +2531,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
|
||||
@ -2789,9 +2795,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
|
||||
@ -2875,9 +2882,10 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
|
||||
@ -3087,8 +3095,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
|
||||
@ -3169,8 +3177,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
|
||||
|
||||
@ -803,14 +803,14 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
|
||||
@ -918,14 +918,14 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
|
||||
@ -1199,14 +1199,14 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
|
||||
@ -1319,14 +1319,14 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
|
||||
@ -1606,14 +1606,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
|
||||
@ -1718,14 +1718,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
|
||||
@ -1990,12 +1990,13 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
|
||||
@ -2106,12 +2107,13 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
|
||||
@ -2377,11 +2379,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
|
||||
@ -2467,11 +2469,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
|
||||
@ -2686,9 +2688,10 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
|
||||
@ -2772,9 +2775,10 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
|
||||
|
||||
@ -803,14 +803,14 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
|
||||
@ -918,14 +918,14 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
|
||||
@ -1199,14 +1199,14 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
|
||||
@ -1319,14 +1319,14 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
|
||||
@ -1606,14 +1606,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
|
||||
@ -1718,14 +1718,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
|
||||
@ -1990,12 +1990,13 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
|
||||
@ -2106,12 +2107,13 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
|
||||
@ -2377,11 +2379,11 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
|
||||
@ -2467,11 +2469,11 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
|
||||
@ -2686,9 +2688,10 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
|
||||
@ -2772,9 +2775,10 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
|
||||
|
||||
@ -1721,12 +1721,13 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
|
||||
@ -1832,12 +1833,13 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
|
||||
@ -2106,12 +2108,13 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
|
||||
@ -2222,12 +2225,13 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
|
||||
@ -2502,12 +2506,13 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
|
||||
@ -2610,12 +2615,13 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
|
||||
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
|
||||
@ -2875,11 +2881,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
|
||||
@ -2987,11 +2993,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
|
||||
@ -3251,9 +3257,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
|
||||
@ -3337,9 +3344,10 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
|
||||
@ -3549,8 +3557,8 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
|
||||
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
|
||||
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
|
||||
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
|
||||
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
|
||||
@ -3631,8 +3639,8 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
|
||||
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
|
||||
|
||||
@ -223,3 +223,43 @@ define i32 @fancy_zero() {
|
||||
ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7))
|
||||
to i32)
|
||||
}
|
||||
|
||||
define i32 @load_null() {
|
||||
; CHECK-LABEL: define i32 @load_null
|
||||
; CHECK-SAME: () #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 null, i32 0, i32 0, i32 0)
|
||||
; CHECK-NEXT: ret i32 [[X]]
|
||||
;
|
||||
%x = load i32, ptr addrspace(7) null, align 4
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define void @store_null() {
|
||||
; CHECK-LABEL: define void @store_null
|
||||
; CHECK-SAME: () #[[ATTR0]] {
|
||||
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 0, ptr addrspace(8) align 4 null, i32 0, i32 0, i32 0)
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
store i32 0, ptr addrspace(7) null, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define i32 @load_poison() {
|
||||
; CHECK-LABEL: define i32 @load_poison
|
||||
; CHECK-SAME: () #[[ATTR0]] {
|
||||
; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 poison, i32 poison, i32 0, i32 0)
|
||||
; CHECK-NEXT: ret i32 [[X]]
|
||||
;
|
||||
%x = load i32, ptr addrspace(7) poison, align 4
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define void @store_poison() {
|
||||
; CHECK-LABEL: define void @store_poison
|
||||
; CHECK-SAME: () #[[ATTR0]] {
|
||||
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 0, ptr addrspace(8) align 4 poison, i32 poison, i32 0, i32 0)
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
store i32 0, ptr addrspace(7) poison, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -2382,22 +2382,13 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
|
||||
}
|
||||
|
||||
define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
|
||||
; SDAG-GFX1100-TRUE16-LABEL: mixlo_zext:
|
||||
; SDAG-GFX1100-TRUE16: ; %bb.0:
|
||||
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2
|
||||
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
|
||||
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-GFX1100-FAKE16-LABEL: mixlo_zext:
|
||||
; SDAG-GFX1100-FAKE16: ; %bb.0:
|
||||
; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
|
||||
; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; SDAG-GFX1100-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX1100-LABEL: mixlo_zext:
|
||||
; GFX1100: ; %bb.0:
|
||||
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
|
||||
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX1100-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX900-LABEL: mixlo_zext:
|
||||
; GFX900: ; %bb.0:
|
||||
@ -2427,14 +2418,6 @@ define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
|
||||
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-GFX1100-LABEL: mixlo_zext:
|
||||
; GISEL-GFX1100: ; %bb.0:
|
||||
; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
|
||||
; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GISEL-GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: mixlo_zext:
|
||||
; GISEL-CI: ; %bb.0:
|
||||
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
|
||||
@ -179,7 +179,8 @@ define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: v_mad_u16_zext:
|
||||
@ -221,9 +222,9 @@ define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) {
|
||||
; GFX11-TRUE16-LABEL: v_mad_u16_zext64:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: v_mad_u16_zext64:
|
||||
|
||||
@ -374,7 +374,7 @@ define i32 @shl_i16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: shl_i16_zext_i32:
|
||||
@ -412,7 +412,7 @@ define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: lshr_i16_zext_i32:
|
||||
@ -450,7 +450,7 @@ define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, v1.l, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: ashr_i16_zext_i32:
|
||||
@ -488,7 +488,7 @@ define i32 @add_u16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: add_u16_zext_i32:
|
||||
@ -526,7 +526,7 @@ define i32 @sub_u16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: sub_u16_zext_i32:
|
||||
@ -564,7 +564,7 @@ define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: mul_lo_u16_zext_i32:
|
||||
@ -602,7 +602,7 @@ define i32 @min_u16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: min_u16_zext_i32:
|
||||
@ -641,7 +641,7 @@ define i32 @min_i16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: min_i16_zext_i32:
|
||||
@ -680,7 +680,7 @@ define i32 @max_u16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: max_u16_zext_i32:
|
||||
@ -719,7 +719,7 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: max_i16_zext_i32:
|
||||
@ -758,7 +758,7 @@ define i32 @zext_fadd_f16(half %x, half %y) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: zext_fadd_f16:
|
||||
@ -797,10 +797,8 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) {
|
||||
; GFX11-TRUE16-LABEL: zext_fma_f16:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v0.h, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: zext_fma_f16:
|
||||
@ -840,7 +838,7 @@ define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, v2.l
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: zext_div_fixup_f16:
|
||||
@ -882,7 +880,7 @@ define i32 @zext_fptrunc_f16(float %x) {
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: zext_fptrunc_f16:
|
||||
@ -926,20 +924,12 @@ define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-TRUE16-LABEL: zext_fptrunc_fma_f16:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
|
||||
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-FAKE16-LABEL: zext_fptrunc_fma_f16:
|
||||
; GFX11-FAKE16: ; %bb.0:
|
||||
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
|
||||
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
|
||||
; GFX11-LABEL: zext_fptrunc_fma_f16:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%fma = call float @llvm.fma.f32(float %x, float %y, float %z)
|
||||
%fptrunc = fptrunc float %fma to half
|
||||
%cast = bitcast half %fptrunc to i16
|
||||
@ -950,5 +940,3 @@ define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
|
||||
declare half @llvm.amdgcn.div.fixup.f16(half, half, half)
|
||||
declare half @llvm.fma.f16(half, half, half)
|
||||
declare float @llvm.fma.f32(float, float, float)
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX11: {{.*}}
|
||||
|
||||
@ -1528,9 +1528,10 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
@ -1559,9 +1560,10 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user