llvm-project/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
Valentin Clement (バレンタイン クレメン) 26b4c25b8b
[flang][cuda] Add support for cudaStreamDestroy (#183648)
Add specific lowering and entry point for cudaStreamDestroy. Since we
keep associated stream for some allocation, we need to reset it when the
stream is destroy so we don't use it anymore.
2026-02-27 00:24:29 +00:00

1856 lines
74 KiB
C++

//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Helper routines for constructing the FIR dialect of MLIR for PowerPC
// intrinsics. Extensive use of MLIR interfaces and MLIR's coding style
// (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this
// module.
//
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"
#include "flang/Evaluate/common.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/MutableBox.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
namespace fir {
using CI = CUDAIntrinsicLibrary;
static const char __ldca_i4x4[] = "__ldca_i4x4_";
static const char __ldca_i8x2[] = "__ldca_i8x2_";
static const char __ldca_r2x2[] = "__ldca_r2x2_";
static const char __ldca_r4x4[] = "__ldca_r4x4_";
static const char __ldca_r8x2[] = "__ldca_r8x2_";
static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
static constexpr unsigned kTMAAlignment = 16;
// CUDA specific intrinsic handlers.
static constexpr IntrinsicHandler cudaHandlers[]{
{"__ldca_i4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldca_i4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldca_i8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldca_i8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldca_r2x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldca_r2x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldca_r4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldca_r4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldca_r8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldca_r8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcg_i4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcg_i4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcg_i8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcg_i8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcg_r2x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcg_r2x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcg_r4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcg_r4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcg_r8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcg_r8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcs_i4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcs_i4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcs_i8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcs_i8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcs_r2x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcs_r2x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcs_r4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcs_r4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcs_r8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcs_r8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcv_i4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcv_i4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcv_i8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcv_i8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcv_r2x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcv_r2x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcv_r4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcv_r4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldcv_r8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldcv_r8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldlu_i4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldlu_i4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldlu_i8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldlu_i8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldlu_r2x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldlu_r2x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldlu_r4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldlu_r4x4, 4>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"__ldlu_r8x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genLDXXFunc<__ldlu_r8x2, 2>),
{{{"a", asAddr}}},
/*isElemental=*/false},
{"all_sync",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>),
{{{"mask", asValue}, {"pred", asValue}}},
/*isElemental=*/false},
{"any_sync",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>),
{{{"mask", asValue}, {"pred", asValue}}},
/*isElemental=*/false},
{"atomicadd_r4x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genAtomicAddVector<2>),
{{{"a", asAddr}, {"v", asAddr}}},
false},
{"atomicadd_r4x4",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genAtomicAddVector4x4),
{{{"a", asAddr}, {"v", asAddr}}},
false},
{"atomicaddd",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicaddf",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicaddi",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicaddl",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicaddr2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2),
{{{"a", asAddr}, {"v", asAddr}}},
false},
{"atomicaddvector_r2x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genAtomicAddVector<2>),
{{{"a", asAddr}, {"v", asAddr}}},
false},
{"atomicaddvector_r4x2",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genAtomicAddVector<2>),
{{{"a", asAddr}, {"v", asAddr}}},
false},
{"atomicandi",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomiccasd",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
{{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
false},
{"atomiccasf",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
{{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
false},
{"atomiccasi",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
{{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
false},
{"atomiccasul",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas),
{{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}},
false},
{"atomicdeci",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicexchd",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicexchf",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicexchi",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicexchul",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicinci",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicmaxd",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicmaxf",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicmaxi",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicmaxl",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicmind",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicminf",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicmini",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicminl",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicori",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicsubd",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicsubf",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicsubi",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicsubl",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"atomicxori",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor),
{{{"a", asAddr}, {"v", asValue}}},
false},
{"ballot_sync",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>),
{{{"mask", asValue}, {"pred", asValue}}},
/*isElemental=*/false},
{"barrier_arrive",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genBarrierArrive),
{{{"barrier", asAddr}}},
/*isElemental=*/false},
{"barrier_arrive_cnt",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genBarrierArriveCnt),
{{{"barrier", asAddr}, {"count", asValue}}},
/*isElemental=*/false},
{"barrier_init",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genBarrierInit),
{{{"barrier", asAddr}, {"count", asValue}}},
/*isElemental=*/false},
{"barrier_try_wait",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genBarrierTryWait),
{{{"barrier", asAddr}, {"token", asValue}}},
/*isElemental=*/false},
{"barrier_try_wait_sleep",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genBarrierTryWaitSleep),
{{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}},
/*isElemental=*/false},
{"clock",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genNVVMTime<mlir::NVVM::ClockOp>),
{},
/*isElemental=*/false},
{"clock64",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genNVVMTime<mlir::NVVM::Clock64Op>),
{},
/*isElemental=*/false},
{"cluster_block_index",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genClusterBlockIndex),
{},
/*isElemental=*/false},
{"cluster_dim_blocks",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genClusterDimBlocks),
{},
/*isElemental=*/false},
{"cudagetstreamdefaultarg",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genCUDAGetDefaultStreamArg),
{{{"devptr", asAddr}}},
/*isElemental=*/false},
{"cudagetstreamdefaultnull",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genCUDAGetDefaultStreamNull),
{},
/*isElemental=*/false},
{"cudasetstreamarray",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genCUDASetDefaultStreamArray),
{{{"devptr", asAddr}, {"stream", asValue}}},
/*isElemental=*/false},
{"cudasetstreamdefault",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genCUDASetDefaultStream),
{{{"stream", asValue}}},
/*isElemental=*/false},
{"cudastreamdestroy",
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(
&CI::genCUDAStreamDestroy),
{{{"stream", asValue}}},
/*isElemental=*/false},
{"fence_proxy_async",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genFenceProxyAsync),
{},
/*isElemental=*/false},
{"globaltimer",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>),
{},
/*isElemental=*/false},
{"match_all_syncjd",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genMatchAllSync),
{{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
/*isElemental=*/false},
{"match_all_syncjf",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genMatchAllSync),
{{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
/*isElemental=*/false},
{"match_all_syncjj",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genMatchAllSync),
{{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
/*isElemental=*/false},
{"match_all_syncjx",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genMatchAllSync),
{{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
/*isElemental=*/false},
{"match_any_syncjd",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genMatchAnySync),
{{{"mask", asValue}, {"value", asValue}}},
/*isElemental=*/false},
{"match_any_syncjf",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genMatchAnySync),
{{{"mask", asValue}, {"value", asValue}}},
/*isElemental=*/false},
{"match_any_syncjj",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genMatchAnySync),
{{{"mask", asValue}, {"value", asValue}}},
/*isElemental=*/false},
{"match_any_syncjx",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genMatchAnySync),
{{{"mask", asValue}, {"value", asValue}}},
/*isElemental=*/false},
{"syncthreads",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genSyncThreads),
{},
/*isElemental=*/false},
{"syncthreads_and_i4",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genSyncThreadsAnd),
{},
/*isElemental=*/false},
{"syncthreads_and_l4",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genSyncThreadsAnd),
{},
/*isElemental=*/false},
{"syncthreads_count_i4",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genSyncThreadsCount),
{},
/*isElemental=*/false},
{"syncthreads_count_l4",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genSyncThreadsCount),
{},
/*isElemental=*/false},
{"syncthreads_or_i4",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genSyncThreadsOr),
{},
/*isElemental=*/false},
{"syncthreads_or_l4",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genSyncThreadsOr),
{},
/*isElemental=*/false},
{"syncwarp",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp),
{},
/*isElemental=*/false},
{"this_cluster",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisCluster),
{},
/*isElemental=*/false},
{"this_grid",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid),
{},
/*isElemental=*/false},
{"this_thread_block",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
&CI::genThisThreadBlock),
{},
/*isElemental=*/false},
{"this_warp",
static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp),
{},
/*isElemental=*/false},
{"threadfence",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genThreadFence<mlir::NVVM::MemScopeKind::GPU>),
{},
/*isElemental=*/false},
{"threadfence_block",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genThreadFence<mlir::NVVM::MemScopeKind::CTA>),
{},
/*isElemental=*/false},
{"threadfence_system",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genThreadFence<mlir::NVVM::MemScopeKind::SYS>),
{},
/*isElemental=*/false},
{"tma_bulk_commit_group",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkCommitGroup),
{{}},
/*isElemental=*/false},
{"tma_bulk_g2s",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S),
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nbytes", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldc4",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkLoadC4),
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldc8",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkLoadC8),
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldi4",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkLoadI4),
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldi8",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkLoadI8),
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldr2",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkLoadR2),
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldr4",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkLoadR4),
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_ldr8",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkLoadR8),
{{{"barrier", asAddr},
{"src", asAddr},
{"dst", asAddr},
{"nelems", asValue}}},
/*isElemental=*/false},
{"tma_bulk_s2g",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkS2G),
{{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}},
/*isElemental=*/false},
{"tma_bulk_store_c4",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkStoreC4),
{{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
/*isElemental=*/false},
{"tma_bulk_store_c8",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkStoreC8),
{{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
/*isElemental=*/false},
{"tma_bulk_store_i4",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkStoreI4),
{{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
/*isElemental=*/false},
{"tma_bulk_store_i8",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkStoreI8),
{{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
/*isElemental=*/false},
{"tma_bulk_store_r2",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkStoreR2),
{{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
/*isElemental=*/false},
{"tma_bulk_store_r4",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkStoreR4),
{{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
/*isElemental=*/false},
{"tma_bulk_store_r8",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkStoreR8),
{{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}},
/*isElemental=*/false},
{"tma_bulk_wait_group",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
&CI::genTMABulkWaitGroup),
{{}},
/*isElemental=*/false},
};
template <std::size_t N>
static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) {
// Replace by std::sorted when C++20 is default (will be constexpr).
const IntrinsicHandler *lastSeen{nullptr};
bool isSorted{true};
for (const auto &x : array) {
if (lastSeen)
isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name};
lastSeen = &x;
}
return isSorted;
}
static_assert(isSorted(cudaHandlers) && "map must be sorted");
const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) {
auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) {
return name.compare(cudaHandler.name) > 0;
};
auto result = llvm::lower_bound(cudaHandlers, name, compare);
return result != std::end(cudaHandlers) && result->name == name ? result
: nullptr;
}
static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder,
mlir::Location loc,
mlir::Value barrier,
mlir::NVVM::NVVMMemorySpace space) {
mlir::Value llvmPtr = fir::ConvertOp::create(
builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()),
barrier);
mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create(
builder, loc,
mlir::LLVM::LLVMPointerType::get(builder.getContext(),
static_cast<unsigned>(space)),
llvmPtr);
return addrCast;
}
static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc,
mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0,
mlir::Value arg1) {
auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext());
arg0 = builder.createConvert(loc, llvmPointerType, arg0);
return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1,
mlir::LLVM::AtomicOrdering::seq_cst);
}
// ATOMICADD
mlir::Value
CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
mlir::LLVM::AtomicBinOp binOp =
mlir::isa<mlir::IntegerType>(args[1].getType())
? mlir::LLVM::AtomicBinOp::add
: mlir::LLVM::AtomicBinOp::fadd;
return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}
fir::ExtendedValue
CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType,
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
mlir::Value a = fir::getBase(args[0]);
if (mlir::isa<fir::BaseBoxType>(a.getType())) {
a = fir::BoxAddrOp::create(builder, loc, a);
}
auto loc = builder.getUnknownLoc();
auto f16Ty = builder.getF16Type();
auto i32Ty = builder.getI32Type();
auto vecF16Ty = mlir::VectorType::get({2}, f16Ty);
mlir::Type idxTy = builder.getIndexType();
auto f16RefTy = fir::ReferenceType::get(f16Ty);
auto zero = builder.createIntegerConstant(loc, idxTy, 0);
auto one = builder.createIntegerConstant(loc, idxTy, 1);
auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy,
fir::getBase(args[1]), zero);
auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy,
fir::getBase(args[1]), one);
auto v1 = fir::LoadOp::create(builder, loc, v1Coord);
auto v2 = fir::LoadOp::create(builder, loc, v2Coord);
mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty);
mlir::Value vec1 = mlir::LLVM::InsertElementOp::create(
builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0));
mlir::Value vec2 = mlir::LLVM::InsertElementOp::create(
builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1));
auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2);
auto i32VecTy = mlir::VectorType::get({1}, i32Ty);
mlir::Value vecI32 =
mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res);
return mlir::vector::ExtractOp::create(builder, loc, vecI32,
mlir::ArrayRef<int64_t>{0});
}
// ATOMICADDVECTOR
template <int extent>
fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector(
mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
mlir::Value res = fir::AllocaOp::create(
builder, loc, fir::SequenceType::get({extent}, resultType));
mlir::Value a = fir::getBase(args[0]);
if (mlir::isa<fir::BaseBoxType>(a.getType())) {
a = fir::BoxAddrOp::create(builder, loc, a);
}
auto vecTy = mlir::VectorType::get({extent}, resultType);
auto refTy = fir::ReferenceType::get(resultType);
mlir::Type i32Ty = builder.getI32Type();
mlir::Type idxTy = builder.getIndexType();
// Extract the values from the array.
llvm::SmallVector<mlir::Value> values;
for (unsigned i = 0; i < extent; ++i) {
mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i);
mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy,
fir::getBase(args[1]), pos);
mlir::Value value = fir::LoadOp::create(builder, loc, coord);
values.push_back(value);
}
// Pack extracted values into a vector to call the atomic add.
mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy);
for (unsigned i = 0; i < extent; ++i) {
mlir::Value insert = mlir::LLVM::InsertElementOp::create(
builder, loc, undef, values[i],
builder.createIntegerConstant(loc, i32Ty, i));
undef = insert;
}
// Atomic operation with a vector of values.
mlir::Value add =
genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef);
// Store results in the result array.
for (unsigned i = 0; i < extent; ++i) {
mlir::Value r = mlir::LLVM::ExtractElementOp::create(
builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i));
mlir::Value c = fir::CoordinateOp::create(
builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i));
fir::StoreOp::create(builder, loc, r, c);
}
mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent);
return fir::ArrayBoxValue(res, {ext});
}
// ATOMICADDVECTOR4x4
fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector4x4(
mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
mlir::Value a = fir::getBase(args[0]);
if (mlir::isa<fir::BaseBoxType>(a.getType()))
a = fir::BoxAddrOp::create(builder, loc, a);
const unsigned extent = 4;
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
mlir::Value ptr = builder.createConvert(loc, llvmPtrTy, a);
mlir::Type f32Ty = builder.getF32Type();
mlir::Type idxTy = builder.getIndexType();
mlir::Type refTy = fir::ReferenceType::get(f32Ty);
llvm::SmallVector<mlir::Value> values;
for (unsigned i = 0; i < extent; ++i) {
mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i);
mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy,
fir::getBase(args[1]), pos);
mlir::Value value = fir::LoadOp::create(builder, loc, coord);
values.push_back(value);
}
auto inlinePtx = mlir::NVVM::InlinePtxOp::create(
builder, loc, {f32Ty, f32Ty, f32Ty, f32Ty},
{ptr, values[0], values[1], values[2], values[3]}, {},
"atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};", {});
llvm::SmallVector<mlir::Value> results;
results.push_back(inlinePtx.getResult(0));
results.push_back(inlinePtx.getResult(1));
results.push_back(inlinePtx.getResult(2));
results.push_back(inlinePtx.getResult(3));
mlir::Type vecF32Ty = mlir::VectorType::get({extent}, f32Ty);
mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF32Ty);
mlir::Type i32Ty = builder.getI32Type();
for (unsigned i = 0; i < extent; ++i)
undef = mlir::LLVM::InsertElementOp::create(
builder, loc, undef, results[i],
builder.createIntegerConstant(loc, i32Ty, i));
auto i128Ty = builder.getIntegerType(128);
auto i128VecTy = mlir::VectorType::get({1}, i128Ty);
mlir::Value vec128 =
mlir::vector::BitCastOp::create(builder, loc, i128VecTy, undef);
return mlir::vector::ExtractOp::create(builder, loc, vec128,
mlir::ArrayRef<int64_t>{0});
}
mlir::Value
CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and;
return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}
mlir::Value
CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or;
return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}
// ATOMICCAS
fir::ExtendedValue
CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType,
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel;
auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic;
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext());
mlir::Value arg0 = fir::getBase(args[0]);
mlir::Value arg1 = fir::getBase(args[1]);
mlir::Value arg2 = fir::getBase(args[2]);
auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value {
if (mlir::isa<mlir::Float32Type>(arg.getType()))
return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(),
arg);
if (mlir::isa<mlir::Float64Type>(arg.getType()))
return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(),
arg);
return arg;
};
arg1 = bitCastFloat(arg1);
arg2 = bitCastFloat(arg2);
if (arg1.getType() != arg2.getType()) {
// arg1 and arg2 need to have the same type in AtomicCmpXchgOp.
arg2 = builder.createConvert(loc, arg1.getType(), arg2);
}
auto address =
mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0)
.getResult(0);
auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create(
builder, loc, address, arg1, arg2, successOrdering, failureOrdering);
mlir::Value boolResult =
mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1);
return builder.createConvert(loc, resultType, boolResult);
}
mlir::Value
CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap;
return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}
// ATOMICEXCH
fir::ExtendedValue
CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType,
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
mlir::Value arg0 = fir::getBase(args[0]);
mlir::Value arg1 = fir::getBase(args[1]);
assert(arg1.getType().isIntOrFloat());
mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg;
return genAtomBinOp(builder, loc, binOp, arg0, arg1);
}
mlir::Value
CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
assert(mlir::isa<mlir::IntegerType>(args[1].getType()));
mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap;
return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}
mlir::Value
CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
mlir::LLVM::AtomicBinOp binOp =
mlir::isa<mlir::IntegerType>(args[1].getType())
? mlir::LLVM::AtomicBinOp::max
: mlir::LLVM::AtomicBinOp::fmax;
return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}
mlir::Value
CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
mlir::LLVM::AtomicBinOp binOp =
mlir::isa<mlir::IntegerType>(args[1].getType())
? mlir::LLVM::AtomicBinOp::min
: mlir::LLVM::AtomicBinOp::fmin;
return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}
// ATOMICSUB
mlir::Value
CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
mlir::LLVM::AtomicBinOp binOp =
mlir::isa<mlir::IntegerType>(args[1].getType())
? mlir::LLVM::AtomicBinOp::sub
: mlir::LLVM::AtomicBinOp::fsub;
return genAtomBinOp(builder, loc, binOp, args[0], args[1]);
}
// ATOMICXOR
fir::ExtendedValue
CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType,
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
mlir::Value arg0 = fir::getBase(args[0]);
mlir::Value arg1 = fir::getBase(args[1]);
return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1);
}
// BARRIER_ARRIVE
mlir::Value
CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 1);
mlir::Value barrier = convertPtrToNVVMSpace(
builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier)
.getResult(0);
}
// BARRIER_ARRIBVE_CNT
mlir::Value
CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
mlir::Value barrier = convertPtrToNVVMSpace(
builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared);
return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType},
{barrier, args[1]}, {},
"mbarrier.arrive.expect_tx.release."
"cta.shared::cta.b64 %0, [%1], %2;",
{})
.getResult(0);
}
// BARRIER_INIT
void CUDAIntrinsicLibrary::genBarrierInit(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
mlir::Value barrier = convertPtrToNVVMSpace(
builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared);
mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier,
fir::getBase(args[1]), {});
auto kind = mlir::NVVM::ProxyKindAttr::get(
builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
auto space = mlir::NVVM::SharedSpaceAttr::get(
builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
}
// BARRIER_TRY_WAIT
mlir::Value
CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
fir::StoreOp::create(builder, loc, zero, res);
mlir::Value ns =
builder.createIntegerConstant(loc, builder.getI32Type(), 1000000);
mlir::Value load = fir::LoadOp::create(builder, loc, res);
auto whileOp = mlir::scf::WhileOp::create(
builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load});
mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore());
mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc);
builder.setInsertionPointToStart(beforeBlock);
mlir::Value condition = mlir::arith::CmpIOp::create(
builder, loc, mlir::arith::CmpIPredicate::eq, beforeArg, zero);
mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg);
mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter());
afterBlock->addArgument(resultType, loc);
builder.setInsertionPointToStart(afterBlock);
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
mlir::Value ret = mlir::NVVM::InlinePtxOp::create(
builder, loc, {resultType}, {barrier, args[1], ns}, {},
"{\n"
" .reg .pred p;\n"
" mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n"
" selp.b32 %0, 1, 0, p;\n"
"}",
{})
.getResult(0);
mlir::scf::YieldOp::create(builder, loc, ret);
builder.setInsertionPointAfter(whileOp);
return whileOp.getResult(0);
}
// BARRIER_TRY_WAIT_SLEEP
mlir::Value
CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 3);
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
return mlir::NVVM::InlinePtxOp::create(
builder, loc, {resultType}, {barrier, args[1], args[2]}, {},
"{\n"
" .reg .pred p;\n"
" mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n"
" selp.b32 %0, 1, 0, p;\n"
"}",
{})
.getResult(0);
}
static void insertValueAtPos(fir::FirOpBuilder &builder, mlir::Location loc,
fir::RecordType recTy, mlir::Value base,
mlir::Value dim, unsigned fieldPos) {
auto fieldName = recTy.getTypeList()[fieldPos].first;
mlir::Type fieldTy = recTy.getTypeList()[fieldPos].second;
mlir::Type fieldIndexType = fir::FieldType::get(base.getContext());
mlir::Value fieldIndex =
fir::FieldIndexOp::create(builder, loc, fieldIndexType, fieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value coord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(fieldTy), base, fieldIndex);
fir::StoreOp::create(builder, loc, dim, coord);
}
// CLUSTER_BLOCK_INDEX
mlir::Value
CUDAIntrinsicLibrary::genClusterBlockIndex(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0);
auto recTy = mlir::cast<fir::RecordType>(resultType);
assert(recTy && "RecordType expepected");
mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
mlir::Type i32Ty = builder.getI32Type();
mlir::Value x = mlir::NVVM::BlockInClusterIdXOp::create(builder, loc, i32Ty);
mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
x = mlir::arith::AddIOp::create(builder, loc, x, one);
insertValueAtPos(builder, loc, recTy, res, x, 0);
mlir::Value y = mlir::NVVM::BlockInClusterIdYOp::create(builder, loc, i32Ty);
y = mlir::arith::AddIOp::create(builder, loc, y, one);
insertValueAtPos(builder, loc, recTy, res, y, 1);
mlir::Value z = mlir::NVVM::BlockInClusterIdZOp::create(builder, loc, i32Ty);
z = mlir::arith::AddIOp::create(builder, loc, z, one);
insertValueAtPos(builder, loc, recTy, res, z, 2);
return res;
}
// CLUSTER_DIM_BLOCKS
mlir::Value
CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0);
auto recTy = mlir::cast<fir::RecordType>(resultType);
assert(recTy && "RecordType expepected");
mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
mlir::Type i32Ty = builder.getI32Type();
mlir::Value x = mlir::NVVM::ClusterDimBlocksXOp::create(builder, loc, i32Ty);
insertValueAtPos(builder, loc, recTy, res, x, 0);
mlir::Value y = mlir::NVVM::ClusterDimBlocksYOp::create(builder, loc, i32Ty);
insertValueAtPos(builder, loc, recTy, res, y, 1);
mlir::Value z = mlir::NVVM::ClusterDimBlocksZOp::create(builder, loc, i32Ty);
insertValueAtPos(builder, loc, recTy, res, z, 2);
return res;
}
// CUDASETSTREAMDEFAULT
fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStream(
mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 1);
mlir::Value stream = fir::getBase(args[0]);
mlir::Type i64Ty = builder.getI64Type();
auto ctx = builder.getContext();
mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {i64Ty}, {resTy});
auto funcOp =
builder.createFunction(loc, RTNAME_STRING(CUFSetDefaultStream), ftype);
auto call = fir::CallOp::create(builder, loc, funcOp, {stream});
return call.getResult(0);
}
// CUDASETSTREAMARRAY
fir::ExtendedValue CUDAIntrinsicLibrary::genCUDASetDefaultStreamArray(
mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
mlir::Value arg = fir::getBase(args[0]);
mlir::Value stream = fir::getBase(args[1]);
if (mlir::isa<fir::BaseBoxType>(arg.getType()))
arg = fir::BoxAddrOp::create(builder, loc, arg);
mlir::Type i64Ty = builder.getI64Type();
mlir::Type i32Ty = builder.getI32Type();
auto ctx = builder.getContext();
mlir::Type voidPtrTy =
fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
mlir::FunctionType ftype =
mlir::FunctionType::get(ctx, {voidPtrTy, i64Ty}, {i32Ty});
mlir::Value voidPtr = builder.createConvert(loc, voidPtrTy, arg);
auto funcOp =
builder.createFunction(loc, RTNAME_STRING(CUFSetAssociatedStream), ftype);
auto call = fir::CallOp::create(builder, loc, funcOp, {voidPtr, stream});
return call.getResult(0);
}
// CUDASTREAMDESTROY
fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAStreamDestroy(
mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 1);
mlir::Value stream = fir::getBase(args[0]);
mlir::Type i64Ty = builder.getI64Type();
auto ctx = builder.getContext();
mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {i64Ty}, {resTy});
auto funcOp =
builder.createFunction(loc, RTNAME_STRING(CUFStreamDestroy), ftype);
auto call = fir::CallOp::create(builder, loc, funcOp, {stream});
return call.getResult(0);
}
// CUDASTREAMSYNCHRONIZE
fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAStreamSynchronize(
mlir::Type resTy, llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 1);
mlir::Value stream = fir::getBase(args[0]);
mlir::Type i64Ty = builder.getI64Type();
auto ctx = builder.getContext();
mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {i64Ty}, {resTy});
auto funcOp =
builder.createFunction(loc, RTNAME_STRING(CUFStreamSynchronize), ftype);
auto call = fir::CallOp::create(builder, loc, funcOp, {stream});
return call.getResult(0);
}
// CUDASTREAMSYNCHRONIZENULL
mlir::Value CUDAIntrinsicLibrary::genCUDAStreamSynchronizeNull(
mlir::Type resTy, llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0);
auto ctx = builder.getContext();
mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {}, {resTy});
auto funcOp = builder.createFunction(
loc, RTNAME_STRING(CUFStreamSynchronizeNull), ftype);
auto call = fir::CallOp::create(builder, loc, funcOp, {});
return call.getResult(0);
}
// CUDAGETDEFAULTSTREAMARG
fir::ExtendedValue CUDAIntrinsicLibrary::genCUDAGetDefaultStreamArg(
mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 1);
mlir::Value devptr = fir::getBase(args[0]);
mlir::Type i64Ty = builder.getI64Type();
auto ctx = builder.getContext();
mlir::Type voidPtrTy =
fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {voidPtrTy}, {i64Ty});
mlir::Value voidPtr = builder.createConvert(loc, voidPtrTy, devptr);
auto funcOp =
builder.createFunction(loc, RTNAME_STRING(CUFGetAssociatedStream), ftype);
auto call = fir::CallOp::create(builder, loc, funcOp, {voidPtr});
return call.getResult(0);
}
// CUDAGETDEFAULTSTREAMNULL
mlir::Value CUDAIntrinsicLibrary::genCUDAGetDefaultStreamNull(
mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0);
mlir::Type i64Ty = builder.getI64Type();
auto ctx = builder.getContext();
mlir::FunctionType ftype = mlir::FunctionType::get(ctx, {}, {i64Ty});
auto funcOp =
builder.createFunction(loc, RTNAME_STRING(CUFGetDefaultStream), ftype);
auto call = fir::CallOp::create(builder, loc, funcOp, {});
return call.getResult(0);
}
// FENCE_PROXY_ASYNC
void CUDAIntrinsicLibrary::genFenceProxyAsync(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 0);
auto kind = mlir::NVVM::ProxyKindAttr::get(
builder.getContext(), mlir::NVVM::ProxyKind::async_shared);
auto space = mlir::NVVM::SharedSpaceAttr::get(
builder.getContext(), mlir::NVVM::SharedSpace::shared_cta);
mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space);
}
// __LDCA, __LDCS, __LDLU, __LDCV
template <const char *fctName, int extent>
fir::ExtendedValue
CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType,
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 1);
mlir::Type resTy = fir::SequenceType::get(extent, resultType);
mlir::Value arg = fir::getBase(args[0]);
mlir::Value res = fir::AllocaOp::create(builder, loc, resTy);
if (mlir::isa<fir::BaseBoxType>(arg.getType()))
arg = fir::BoxAddrOp::create(builder, loc, arg);
mlir::Type refResTy = fir::ReferenceType::get(resTy);
mlir::FunctionType ftype =
mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {});
auto funcOp = builder.createFunction(loc, fctName, ftype);
llvm::SmallVector<mlir::Value> funcArgs;
funcArgs.push_back(res);
funcArgs.push_back(arg);
fir::CallOp::create(builder, loc, funcOp, funcArgs);
mlir::Value ext =
builder.createIntegerConstant(loc, builder.getIndexType(), extent);
return fir::ArrayBoxValue(res, {ext});
}
// CLOCK, CLOCK64, GLOBALTIMER
template <typename OpTy>
mlir::Value
CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0 && "expect no arguments");
return OpTy::create(builder, loc, resultType).getResult();
}
// MATCH_ALL_SYNC
mlir::Value
CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 3);
bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32();
mlir::Type i1Ty = builder.getI1Type();
mlir::MLIRContext *context = builder.getContext();
mlir::Value arg1 = args[1];
if (arg1.getType().isF32() || arg1.getType().isF64())
arg1 = fir::ConvertOp::create(
builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1);
mlir::Type retTy =
mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty});
auto match =
mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1,
mlir::NVVM::MatchSyncKind::all)
.getResult();
auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0);
auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1);
auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred);
fir::StoreOp::create(builder, loc, conv, args[2]);
return value;
}
// MATCH_ANY_SYNC
mlir::Value
CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32();
mlir::Value arg1 = args[1];
if (arg1.getType().isF32() || arg1.getType().isF64())
arg1 = fir::ConvertOp::create(
builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1);
return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0],
arg1, mlir::NVVM::MatchSyncKind::any)
.getResult();
}
// SYNCTHREADS
void CUDAIntrinsicLibrary::genSyncThreads(
llvm::ArrayRef<fir::ExtendedValue> args) {
mlir::NVVM::Barrier0Op::create(builder, loc);
}
// SYNCTHREADS_AND
mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
return mlir::NVVM::BarrierOp::create(
builder, loc, resultType, {}, {},
mlir::NVVM::BarrierReductionAttr::get(
builder.getContext(), mlir::NVVM::BarrierReduction::AND),
arg)
.getResult(0);
}
// SYNCTHREADS_COUNT
mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
return mlir::NVVM::BarrierOp::create(
builder, loc, resultType, {}, {},
mlir::NVVM::BarrierReductionAttr::get(
builder.getContext(), mlir::NVVM::BarrierReduction::POPC),
arg)
.getResult(0);
}
// SYNCTHREADS_OR
mlir::Value
CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
mlir::Value arg = builder.createConvert(loc, builder.getI32Type(), args[0]);
return mlir::NVVM::BarrierOp::create(
builder, loc, resultType, {}, {},
mlir::NVVM::BarrierReductionAttr::get(
builder.getContext(), mlir::NVVM::BarrierReduction::OR),
arg)
.getResult(0);
}
// SYNCWARP
void CUDAIntrinsicLibrary::genSyncWarp(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 1);
mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0]));
}
// THIS_CLUSTER
mlir::Value
CUDAIntrinsicLibrary::genThisCluster(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0);
auto recTy = mlir::cast<fir::RecordType>(resultType);
assert(recTy && "RecordType expepected");
mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
mlir::Type i32Ty = builder.getI32Type();
// SIZE
mlir::Value size = mlir::NVVM::ClusterDim::create(builder, loc, i32Ty);
auto sizeFieldName = recTy.getTypeList()[1].first;
mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
builder, loc, fieldIndexType, sizeFieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value sizeCoord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
fir::StoreOp::create(builder, loc, size, sizeCoord);
// RANK
mlir::Value rank = mlir::NVVM::ClusterId::create(builder, loc, i32Ty);
mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
auto rankFieldName = recTy.getTypeList()[2].first;
mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
builder, loc, fieldIndexType, rankFieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value rankCoord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
fir::StoreOp::create(builder, loc, rank, rankCoord);
return res;
}
// THIS_GRID
mlir::Value
CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0);
auto recTy = mlir::cast<fir::RecordType>(resultType);
assert(recTy && "RecordType expepected");
mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
mlir::Type i32Ty = builder.getI32Type();
mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty);
mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty);
mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty);
mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty);
mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty);
mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty);
mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty);
mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty);
mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty);
mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty);
mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty);
// this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) *
// (blockDim.x * gridDim.x);
mlir::Value resZ =
mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ);
mlir::Value resY =
mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY);
mlir::Value resX =
mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX);
mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY);
mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX);
// tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) +
// blockIdx.x;
// this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) +
// ((threadIdx.z * blockDim.y) * blockDim.x) +
// (threadIdx.y * blockDim.x) + threadIdx.x + 1;
mlir::Value r1 =
mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY);
mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX);
mlir::Value r3 =
mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX);
mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3);
mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX);
mlir::Value bXbY =
mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY);
mlir::Value bXbYbZ =
mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ);
mlir::Value tZbY =
mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY);
mlir::Value tZbYbX =
mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX);
mlir::Value tYbX =
mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX);
mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ);
rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX);
rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX);
rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX);
mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
auto sizeFieldName = recTy.getTypeList()[1].first;
mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
builder, loc, fieldIndexType, sizeFieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value sizeCoord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
fir::StoreOp::create(builder, loc, size, sizeCoord);
auto rankFieldName = recTy.getTypeList()[2].first;
mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
builder, loc, fieldIndexType, rankFieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value rankCoord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
fir::StoreOp::create(builder, loc, rank, rankCoord);
return res;
}
// THIS_THREAD_BLOCK
mlir::Value
CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0);
auto recTy = mlir::cast<fir::RecordType>(resultType);
assert(recTy && "RecordType expepected");
mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
mlir::Type i32Ty = builder.getI32Type();
// this_thread_block%size = blockDim.z * blockDim.y * blockDim.x;
mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty);
mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty);
mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty);
mlir::Value size =
mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY);
size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX);
// this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) +
// (threadIdx.y * blockDim.x) + threadIdx.x + 1;
mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty);
mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty);
mlir::Value r1 =
mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY);
mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX);
mlir::Value r3 =
mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX);
mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3);
mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX);
mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
auto sizeFieldName = recTy.getTypeList()[1].first;
mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
builder, loc, fieldIndexType, sizeFieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value sizeCoord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
fir::StoreOp::create(builder, loc, size, sizeCoord);
auto rankFieldName = recTy.getTypeList()[2].first;
mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
builder, loc, fieldIndexType, rankFieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value rankCoord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
fir::StoreOp::create(builder, loc, rank, rankCoord);
return res;
}
// THIS_WARP
mlir::Value
CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 0);
auto recTy = mlir::cast<fir::RecordType>(resultType);
assert(recTy && "RecordType expepected");
mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
mlir::Type i32Ty = builder.getI32Type();
// coalesced_group%size = 32
mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32);
auto sizeFieldName = recTy.getTypeList()[1].first;
mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
builder, loc, fieldIndexType, sizeFieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value sizeCoord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
fir::StoreOp::create(builder, loc, size, sizeCoord);
// coalesced_group%rank = threadIdx.x & 31 + 1
mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty);
mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31);
mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
mlir::Value masked =
mlir::arith::AndIOp::create(builder, loc, threadIdX, mask);
mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one);
auto rankFieldName = recTy.getTypeList()[2].first;
mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
builder, loc, fieldIndexType, rankFieldName, recTy,
/*typeParams=*/mlir::ValueRange{});
mlir::Value rankCoord = fir::CoordinateOp::create(
builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
fir::StoreOp::create(builder, loc, rank, rankCoord);
return res;
}
// THREADFENCE, THREADFENCE_BLOCK, THREADFENCE_SYSTEM
template <mlir::NVVM::MemScopeKind scope>
void CUDAIntrinsicLibrary::genThreadFence(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 0);
mlir::NVVM::MembarOp::create(builder, loc, scope);
}
// TMA_BULK_COMMIT_GROUP
void CUDAIntrinsicLibrary::genTMABulkCommitGroup(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 0);
mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc);
}
// TMA_BULK_G2S
void CUDAIntrinsicLibrary::genTMABulkG2S(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value barrier = convertPtrToNVVMSpace(
builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared);
mlir::Value dst =
convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]),
mlir::NVVM::NVVMMemorySpace::SharedCluster);
mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
mlir::NVVM::NVVMMemorySpace::Global);
mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create(
builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
}
static void setAlignment(mlir::Value ptr, unsigned alignment) {
if (auto declareOp = mlir::dyn_cast<hlfir::DeclareOp>(ptr.getDefiningOp()))
if (auto sharedOp = mlir::dyn_cast<cuf::SharedMemoryOp>(
declareOp.getMemref().getDefiningOp()))
sharedOp.setAlignment(alignment);
}
static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value barrier, mlir::Value src,
mlir::Value dst, mlir::Value nelem,
mlir::Value eleSize) {
mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize);
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
barrier = builder.createConvert(loc, llvmPtrTy, barrier);
setAlignment(dst, kTMAAlignment);
dst = builder.createConvert(loc, llvmPtrTy, dst);
src = builder.createConvert(loc, llvmPtrTy, src);
mlir::NVVM::InlinePtxOp::create(
builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {},
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], "
"[%1], %2, [%3];",
{});
mlir::NVVM::InlinePtxOp::create(
builder, loc, mlir::TypeRange{}, {barrier, size}, {},
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {});
}
// TMA_BULK_LOADC4
void CUDAIntrinsicLibrary::genTMABulkLoadC4(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}
// TMA_BULK_LOADC8
void CUDAIntrinsicLibrary::genTMABulkLoadC8(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 16);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}
// TMA_BULK_LOADI4
void CUDAIntrinsicLibrary::genTMABulkLoadI4(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 4);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}
// TMA_BULK_LOADI8
void CUDAIntrinsicLibrary::genTMABulkLoadI8(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}
// TMA_BULK_LOADR2
void CUDAIntrinsicLibrary::genTMABulkLoadR2(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 2);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}
// TMA_BULK_LOADR4
void CUDAIntrinsicLibrary::genTMABulkLoadR4(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 4);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}
// TMA_BULK_LOADR8
void CUDAIntrinsicLibrary::genTMABulkLoadR8(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), fir::getBase(args[3]), eleSize);
}
// TMA_BULK_S2G
void CUDAIntrinsicLibrary::genTMABulkS2G(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]),
mlir::NVVM::NVVMMemorySpace::Shared);
mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]),
mlir::NVVM::NVVMMemorySpace::Global);
mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(
builder, loc, dst, src, fir::getBase(args[2]), {}, {});
mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
"cp.async.bulk.commit_group;", {});
mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
builder.getI32IntegerAttr(0), {});
}
static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value src, mlir::Value dst, mlir::Value count,
mlir::Value eleSize) {
mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count);
setAlignment(src, kTMAAlignment);
src = convertPtrToNVVMSpace(builder, loc, src,
mlir::NVVM::NVVMMemorySpace::Shared);
dst = convertPtrToNVVMSpace(builder, loc, dst,
mlir::NVVM::NVVMMemorySpace::Global);
mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src,
size, {}, {});
mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {},
"cp.async.bulk.commit_group;", {});
mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc,
builder.getI32IntegerAttr(0), {});
}
// TMA_BULK_STORE_C4
void CUDAIntrinsicLibrary::genTMABulkStoreC4(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), eleSize);
}
// TMA_BULK_STORE_C8
void CUDAIntrinsicLibrary::genTMABulkStoreC8(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 16);
genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), eleSize);
}
// TMA_BULK_STORE_I4
void CUDAIntrinsicLibrary::genTMABulkStoreI4(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 4);
genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), eleSize);
}
// TMA_BULK_STORE_I8
void CUDAIntrinsicLibrary::genTMABulkStoreI8(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), eleSize);
}
// TMA_BULK_STORE_R2
void CUDAIntrinsicLibrary::genTMABulkStoreR2(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 2);
genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), eleSize);
}
// TMA_BULK_STORE_R4
void CUDAIntrinsicLibrary::genTMABulkStoreR4(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 4);
genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), eleSize);
}
// TMA_BULK_STORE_R8
void CUDAIntrinsicLibrary::genTMABulkStoreR8(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 3);
mlir::Value eleSize =
builder.createIntegerConstant(loc, builder.getI32Type(), 8);
genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
fir::getBase(args[2]), eleSize);
}
// TMA_BULK_WAIT_GROUP
void CUDAIntrinsicLibrary::genTMABulkWaitGroup(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 0);
auto group = builder.getIntegerAttr(builder.getI32Type(), 0);
mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {});
}
// ALL_SYNC, ANY_SYNC, BALLOT_SYNC
template <mlir::NVVM::VoteSyncKind kind>
mlir::Value
CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
mlir::Value arg1 =
fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]);
mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot
? builder.getI32Type()
: builder.getI1Type();
auto voteRes =
mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind)
.getResult();
return fir::ConvertOp::create(builder, loc, resultType, voteRes);
}
} // namespace fir