
Note: This relands #140615 adding a ".count" suffix to the non-".all" variants. Our current support for barrier intrinsics is confusing and incomplete, with multiple intrinsics mapping to the same instruction and intrinsic names not clearly conveying their semantics. Further, we lack support for some variants. This change unifies the IR representation to a single consistently named set of intrinsics. - llvm.nvvm.barrier.cta.sync.aligned.all(i32) - llvm.nvvm.barrier.cta.sync.aligned.count(i32, i32) - llvm.nvvm.barrier.cta.arrive.aligned.count(i32, i32) - llvm.nvvm.barrier.cta.sync.all(i32) - llvm.nvvm.barrier.cta.sync.count(i32, i32) - llvm.nvvm.barrier.cta.arrive.count(i32, i32) The following Auto-Upgrade rules are used to maintain compatibility with IR using the legacy intrinsics: * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0) * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x) * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x) * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned.count(x, y) * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x) * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync.count(x, y)
1183 lines
50 KiB
C++
1183 lines
50 KiB
C++
//===-------- NVPTX.cpp - Emit LLVM Code for builtins ---------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This contains code to emit Builtin calls as LLVM code.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "CGBuiltin.h"
|
|
#include "clang/Basic/TargetBuiltins.h"
|
|
#include "llvm/IR/IntrinsicsNVPTX.h"
|
|
|
|
using namespace clang;
|
|
using namespace CodeGen;
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
// Helper classes for mapping MMA builtins to particular LLVM intrinsic variant.
|
|
// Describes how a single MMA load/store builtin maps onto the LLVM WMMA
// intrinsics: how many fragment elements it produces/consumes, and which
// intrinsic implements each of the two matrix layouts.
struct NVPTXMmaLdstInfo {
  unsigned NumResults; // Number of elements to load/store
  // Intrinsic IDs for row/col variants. 0 if particular layout is unsupported.
  unsigned IID_col;
  unsigned IID_row;
};
|
|
|
|
// MMA_INTR expands to the Intrinsic:: enumerator for one WMMA load/store
// geometry/op/type and layout (all use the "stride" form); MMA_LDST builds a
// complete NVPTXMmaLdstInfo from the element count and the col/row variants.
#define MMA_INTR(geom_op_type, layout) \
  Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
#define MMA_LDST(n, geom_op_type) \
  { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }

/// Maps an MMA load/store builtin ID to its NVPTXMmaLdstInfo: the number of
/// result elements and the column-major/row-major intrinsic IDs implementing
/// it (an ID of 0 marks a layout unsupported by that fragment).
static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
  switch (BuiltinID) {
  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
    return MMA_LDST(8, m16n16k16_load_a_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_b:
    return MMA_LDST(8, m16n16k16_load_b_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
    return MMA_LDST(4, m16n16k16_load_c_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
    return MMA_LDST(8, m16n16k16_load_c_f32);
  case NVPTX::BI__hmma_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_b:
    return MMA_LDST(8, m32n8k16_load_b_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
    return MMA_LDST(4, m32n8k16_load_c_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
    return MMA_LDST(8, m32n8k16_load_c_f32);
  case NVPTX::BI__hmma_m8n32k16_ld_a:
    return MMA_LDST(8, m8n32k16_load_a_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
    return MMA_LDST(4, m8n32k16_load_c_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
    return MMA_LDST(8, m8n32k16_load_c_f32);

  // Integer MMA loads
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
    return MMA_LDST(2, m16n16k16_load_a_s8);
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
    return MMA_LDST(2, m16n16k16_load_a_u8);
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
    return MMA_LDST(2, m16n16k16_load_b_s8);
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
    return MMA_LDST(2, m16n16k16_load_b_u8);
  case NVPTX::BI__imma_m16n16k16_ld_c:
    return MMA_LDST(8, m16n16k16_load_c_s32);
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
    return MMA_LDST(4, m32n8k16_load_a_s8);
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
    return MMA_LDST(4, m32n8k16_load_a_u8);
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
    return MMA_LDST(1, m32n8k16_load_b_s8);
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
    return MMA_LDST(1, m32n8k16_load_b_u8);
  case NVPTX::BI__imma_m32n8k16_ld_c:
    return MMA_LDST(8, m32n8k16_load_c_s32);
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
    return MMA_LDST(1, m8n32k16_load_a_s8);
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
    return MMA_LDST(1, m8n32k16_load_a_u8);
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
    return MMA_LDST(4, m8n32k16_load_b_s8);
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
    return MMA_LDST(4, m8n32k16_load_b_u8);
  case NVPTX::BI__imma_m8n32k16_ld_c:
    return MMA_LDST(8, m8n32k16_load_c_s32);

  // Sub-integer MMA loads.
  // Only row/col layout is supported by A/B fragments.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
    return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
    return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_c:
    return MMA_LDST(2, m8n8k32_load_c_s32);
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
    return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
    return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
  case NVPTX::BI__bmma_m8n8k128_ld_c:
    return MMA_LDST(2, m8n8k128_load_c_s32);

  // Double MMA loads
  case NVPTX::BI__dmma_m8n8k4_ld_a:
    return MMA_LDST(1, m8n8k4_load_a_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_b:
    return MMA_LDST(1, m8n8k4_load_b_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_c:
    return MMA_LDST(2, m8n8k4_load_c_f64);

  // Alternate float MMA loads
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
    return MMA_LDST(4, m16n16k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
    return MMA_LDST(4, m16n16k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
    return MMA_LDST(2, m8n32k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
    return MMA_LDST(2, m32n8k16_load_b_bf16);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
    return MMA_LDST(4, m16n16k8_load_a_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
    return MMA_LDST(4, m16n16k8_load_b_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c:
    return MMA_LDST(8, m16n16k8_load_c_f32);

  // NOTE: We need to follow inconsistent naming scheme used by NVCC. Unlike
  // PTX and LLVM IR where stores always use fragment D, NVCC builtins always
  // use fragment C for both loads and stores.
  // FP MMA stores.
  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
    return MMA_LDST(4, m16n16k16_store_d_f16);
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
    return MMA_LDST(8, m16n16k16_store_d_f32);
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
    return MMA_LDST(4, m32n8k16_store_d_f16);
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
    return MMA_LDST(8, m32n8k16_store_d_f32);
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
    return MMA_LDST(4, m8n32k16_store_d_f16);
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
    return MMA_LDST(8, m8n32k16_store_d_f32);

  // Integer and sub-integer MMA stores.
  // Another naming quirk. Unlike other MMA builtins that use PTX types in the
  // name, integer loads/stores use LLVM's i32.
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
    return MMA_LDST(8, m16n16k16_store_d_s32);
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
    return MMA_LDST(8, m32n8k16_store_d_s32);
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
    return MMA_LDST(8, m8n32k16_store_d_s32);
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
    return MMA_LDST(2, m8n8k32_store_d_s32);
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
    return MMA_LDST(2, m8n8k128_store_d_s32);

  // Double MMA store
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
    return MMA_LDST(2, m8n8k4_store_d_f64);

  // Alternate float MMA store
  case NVPTX::BI__mma_m16n16k8_st_c_f32:
    return MMA_LDST(8, m16n16k8_store_d_f32);

  default:
    llvm_unreachable("Unknown MMA builtin");
  }
}
#undef MMA_LDST
#undef MMA_INTR
|
|
|
|
|
// Describes one MMA (matrix-multiply-accumulate) builtin: the number of
// fragment elements for each operand and the table of intrinsic variants.
struct NVPTXMmaInfo {
  unsigned NumEltsA; // Fragment element count for operand A.
  unsigned NumEltsB; // Fragment element count for operand B.
  unsigned NumEltsC; // Fragment element count for operand C.
  unsigned NumEltsD; // Fragment element count for the D result.

  // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority
  // over 'col' for layout. The index of non-satf variants is expected to match
  // the undocumented layout constants used by CUDA's mma.hpp.
  std::array<unsigned, 8> Variants;

  /// Returns the intrinsic ID for the given layout/satf combination, or 0
  /// when that combination is unsupported (including out-of-range indices).
  /// Const-qualified: pure lookup that does not modify the table.
  unsigned getMMAIntrinsic(int Layout, bool Satf) const {
    // Satf variants occupy the upper half of the table.
    unsigned Index = Layout + 4 * Satf;
    if (Index >= Variants.size())
      return 0;
    return Variants[Index];
  }
};
|
|
|
|
// Maps an MMA builtin ID to its NVPTXMmaInfo. Each table entry returned
// contains an intrinsic that matches Layout and Satf for valid combinations
// of Layout and Satf, 0 otherwise.
static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
  // clang-format off
  // Each MMA_VARIANTS* macro expands to the 8-entry Variants table:
  // the four row/col layout combinations, first without .satfinite and
  // then (where supported) with it. Unsupported slots are 0.
#define MMA_VARIANTS(geom, type)                                    \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type
#define MMA_SATF_VARIANTS(geom, type)                               \
      MMA_VARIANTS(geom, type),                                     \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite
// Sub-integer MMA only supports row.col layout.
#define MMA_VARIANTS_I4(geom, type)                                 \
      0,                                                            \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
      0,                                                            \
      0,                                                            \
      0,                                                            \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      0,                                                            \
      0
// b1 MMA does not support .satfinite.
#define MMA_VARIANTS_B1_XOR(geom, type)                             \
      0,                                                            \
      Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type,    \
      0,                                                            \
      0,                                                            \
      0,                                                            \
      0,                                                            \
      0,                                                            \
      0
#define MMA_VARIANTS_B1_AND(geom, type)                             \
      0,                                                            \
      Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type,    \
      0,                                                            \
      0,                                                            \
      0,                                                            \
      0,                                                            \
      0,                                                            \
      0
  // clang-format on
  switch (BuiltinID) {
  // FP MMA
  // Note that 'type' argument of MMA_SATF_VARIANTS uses D_C notation, while
  // NumEltsN of return value are ordered as A,B,C,D.
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};

  // Integer MMA
  case NVPTX::BI__imma_m16n16k16_mma_s8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
  case NVPTX::BI__imma_m16n16k16_mma_u8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_s8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_u8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_s8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_u8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};

  // Sub-integer MMA
  case NVPTX::BI__imma_m8n8k32_mma_s4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
  case NVPTX::BI__imma_m8n8k32_mma_u4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};

  // Double MMA
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
    return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};

  // Alternate FP MMA
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
    return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
    return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
  default:
    llvm_unreachable("Unexpected builtin ID.");
  }
#undef MMA_VARIANTS
#undef MMA_SATF_VARIANTS
#undef MMA_VARIANTS_I4
#undef MMA_VARIANTS_B1_AND
#undef MMA_VARIANTS_B1_XOR
}
|
|
|
|
// Helper for the __nvvm_ldu_* builtins: emits a call to the given ldu
// intrinsic on the pointer passed as the builtin's first argument. The
// pointee's natural alignment is forwarded as the intrinsic's i32 operand.
static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
                      const CallExpr *E) {
  const Expr *PtrArg = E->getArg(0);
  QualType PtrArgTy = PtrArg->getType();
  Value *Ptr = CGF.EmitScalarExpr(PtrArg);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(PtrArgTy->getPointeeType());
  clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(PtrArgTy);
  Value *AlignVal =
      ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity());
  // The intrinsic is overloaded on both the element and pointer types.
  Function *IntrFn =
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()});
  return CGF.Builder.CreateCall(IntrFn, {Ptr, AlignVal});
}
|
|
|
|
// Helper for the __nvvm_ldg_* builtins: lowers the load as an ordinary
// aligned load through an addrspace(1) pointer, tagged with
// !invariant.load metadata.
static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
  const Expr *PtrArg = E->getArg(0);
  QualType PtrArgTy = PtrArg->getType();
  Value *Ptr = CGF.EmitScalarExpr(PtrArg);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(PtrArgTy->getPointeeType());
  clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(PtrArgTy);

  // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL.
  Value *GlobalPtr =
      CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
  llvm::LoadInst *Load =
      CGF.Builder.CreateAlignedLoad(ElemTy, GlobalPtr, Align.getAsAlign());
  // Empty !invariant.load metadata marks the loaded memory as unchanging.
  Load->setMetadata(LLVMContext::MD_invariant_load,
                    MDNode::get(CGF.Builder.getContext(), {}));
  return Load;
}
|
|
|
|
// Helper for the scoped (cta/sys) atomic builtins: emits a call to the
// given intrinsic with the pointer and value operands of the builtin.
static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
                               const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  Value *Val = CGF.EmitScalarExpr(E->getArg(1));
  // The intrinsic is overloaded on both the element and pointer types.
  llvm::Type *ElemTy =
      CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
  Function *IntrFn =
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()});
  return CGF.Builder.CreateCall(IntrFn, {Ptr, Val});
}
|
|
|
|
// Helper for the cp.async builtins: uses IntrinsicIDS when the builtin was
// invoked with a third argument, IntrinsicID otherwise.
// NOTE(review): SrcSize is not consulted in this function; it is kept for
// interface compatibility with existing callers — confirm whether it should
// participate in intrinsic selection.
static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
                          CodeGenFunction &CGF, const CallExpr *E,
                          int SrcSize) {
  bool HasThirdArg = E->getNumArgs() == 3;
  SmallVector<Value *, 3> Args;
  Args.push_back(CGF.EmitScalarExpr(E->getArg(0)));
  Args.push_back(CGF.EmitScalarExpr(E->getArg(1)));
  if (HasThirdArg)
    Args.push_back(CGF.EmitScalarExpr(E->getArg(2)));
  unsigned IID = HasThirdArg ? IntrinsicIDS : IntrinsicID;
  return CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID), Args);
}
|
|
|
|
// Emits a half-precision NVPTX builtin. Diagnoses and bails out when the
// target lacks usable half support; routes the ldg/ldu builtins through
// their dedicated helpers; otherwise calls the intrinsic directly, bitcasting
// arguments to the intrinsic's parameter types where they differ.
static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
                           const CallExpr *E, CodeGenFunction &CGF) {
  auto &Ctx = CGF.CGM.getContext();
  // Equivalent to !(NativeHalfType || !useFP16ConversionIntrinsics()):
  // no native half type AND FP16 requires conversion intrinsics -> error.
  if (!Ctx.getLangOpts().NativeHalfType &&
      Ctx.getTargetInfo().useFP16ConversionIntrinsics()) {
    CGF.CGM.Error(E->getExprLoc(), Ctx.BuiltinInfo.getQuotedName(BuiltinID) +
                                       " requires native half type support.");
    return nullptr;
  }

  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
    return MakeLdg(CGF, E);

  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
    return MakeLdu(IntrinsicID, CGF, E);

  Function *IntrFn = CGF.CGM.getIntrinsic(IntrinsicID);
  llvm::FunctionType *IntrTy = IntrFn->getFunctionType();

  unsigned ICEArguments = 0;
  ASTContext::GetBuiltinTypeError Error;
  Ctx.GetBuiltinType(BuiltinID, Error, &ICEArguments);
  assert(Error == ASTContext::GE_None && "Should not codegen an error");

  SmallVector<Value *, 16> Args;
  for (unsigned Idx = 0, NumArgs = E->getNumArgs(); Idx != NumArgs; ++Idx) {
    // None of these builtins take integer-constant-expression arguments.
    assert((ICEArguments & (1 << Idx)) == 0);
    Value *Arg = CGF.EmitScalarExpr(E->getArg(Idx));
    llvm::Type *ParamTy = IntrTy->getParamType(Idx);
    if (ParamTy != Arg->getType())
      Arg = CGF.Builder.CreateBitCast(Arg, ParamTy);
    Args.push_back(Arg);
  }

  return CGF.Builder.CreateCall(IntrFn, Args);
}
|
|
} // namespace
|
|
|
|
Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
|
|
const CallExpr *E) {
|
|
switch (BuiltinID) {
|
|
case NVPTX::BI__nvvm_atom_add_gen_i:
|
|
case NVPTX::BI__nvvm_atom_add_gen_l:
|
|
case NVPTX::BI__nvvm_atom_add_gen_ll:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_sub_gen_i:
|
|
case NVPTX::BI__nvvm_atom_sub_gen_l:
|
|
case NVPTX::BI__nvvm_atom_sub_gen_ll:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_and_gen_i:
|
|
case NVPTX::BI__nvvm_atom_and_gen_l:
|
|
case NVPTX::BI__nvvm_atom_and_gen_ll:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_or_gen_i:
|
|
case NVPTX::BI__nvvm_atom_or_gen_l:
|
|
case NVPTX::BI__nvvm_atom_or_gen_ll:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_xor_gen_i:
|
|
case NVPTX::BI__nvvm_atom_xor_gen_l:
|
|
case NVPTX::BI__nvvm_atom_xor_gen_ll:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_xchg_gen_i:
|
|
case NVPTX::BI__nvvm_atom_xchg_gen_l:
|
|
case NVPTX::BI__nvvm_atom_xchg_gen_ll:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_max_gen_i:
|
|
case NVPTX::BI__nvvm_atom_max_gen_l:
|
|
case NVPTX::BI__nvvm_atom_max_gen_ll:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_max_gen_ui:
|
|
case NVPTX::BI__nvvm_atom_max_gen_ul:
|
|
case NVPTX::BI__nvvm_atom_max_gen_ull:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_min_gen_i:
|
|
case NVPTX::BI__nvvm_atom_min_gen_l:
|
|
case NVPTX::BI__nvvm_atom_min_gen_ll:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_min_gen_ui:
|
|
case NVPTX::BI__nvvm_atom_min_gen_ul:
|
|
case NVPTX::BI__nvvm_atom_min_gen_ull:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_cas_gen_us:
|
|
case NVPTX::BI__nvvm_atom_cas_gen_i:
|
|
case NVPTX::BI__nvvm_atom_cas_gen_l:
|
|
case NVPTX::BI__nvvm_atom_cas_gen_ll:
|
|
// __nvvm_atom_cas_gen_* should return the old value rather than the
|
|
// success flag.
|
|
return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);
|
|
|
|
case NVPTX::BI__nvvm_atom_add_gen_f:
|
|
case NVPTX::BI__nvvm_atom_add_gen_d: {
|
|
Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
|
|
Value *Val = EmitScalarExpr(E->getArg(1));
|
|
|
|
return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val,
|
|
AtomicOrdering::SequentiallyConsistent);
|
|
}
|
|
|
|
case NVPTX::BI__nvvm_atom_inc_gen_ui:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_dec_gen_ui:
|
|
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E);
|
|
|
|
case NVPTX::BI__nvvm_ldg_c:
|
|
case NVPTX::BI__nvvm_ldg_sc:
|
|
case NVPTX::BI__nvvm_ldg_c2:
|
|
case NVPTX::BI__nvvm_ldg_sc2:
|
|
case NVPTX::BI__nvvm_ldg_c4:
|
|
case NVPTX::BI__nvvm_ldg_sc4:
|
|
case NVPTX::BI__nvvm_ldg_s:
|
|
case NVPTX::BI__nvvm_ldg_s2:
|
|
case NVPTX::BI__nvvm_ldg_s4:
|
|
case NVPTX::BI__nvvm_ldg_i:
|
|
case NVPTX::BI__nvvm_ldg_i2:
|
|
case NVPTX::BI__nvvm_ldg_i4:
|
|
case NVPTX::BI__nvvm_ldg_l:
|
|
case NVPTX::BI__nvvm_ldg_l2:
|
|
case NVPTX::BI__nvvm_ldg_ll:
|
|
case NVPTX::BI__nvvm_ldg_ll2:
|
|
case NVPTX::BI__nvvm_ldg_uc:
|
|
case NVPTX::BI__nvvm_ldg_uc2:
|
|
case NVPTX::BI__nvvm_ldg_uc4:
|
|
case NVPTX::BI__nvvm_ldg_us:
|
|
case NVPTX::BI__nvvm_ldg_us2:
|
|
case NVPTX::BI__nvvm_ldg_us4:
|
|
case NVPTX::BI__nvvm_ldg_ui:
|
|
case NVPTX::BI__nvvm_ldg_ui2:
|
|
case NVPTX::BI__nvvm_ldg_ui4:
|
|
case NVPTX::BI__nvvm_ldg_ul:
|
|
case NVPTX::BI__nvvm_ldg_ul2:
|
|
case NVPTX::BI__nvvm_ldg_ull:
|
|
case NVPTX::BI__nvvm_ldg_ull2:
|
|
case NVPTX::BI__nvvm_ldg_f:
|
|
case NVPTX::BI__nvvm_ldg_f2:
|
|
case NVPTX::BI__nvvm_ldg_f4:
|
|
case NVPTX::BI__nvvm_ldg_d:
|
|
case NVPTX::BI__nvvm_ldg_d2:
|
|
// PTX Interoperability section 2.2: "For a vector with an even number of
|
|
// elements, its alignment is set to number of elements times the alignment
|
|
// of its member: n*alignof(t)."
|
|
return MakeLdg(*this, E);
|
|
|
|
case NVPTX::BI__nvvm_ldu_c:
|
|
case NVPTX::BI__nvvm_ldu_sc:
|
|
case NVPTX::BI__nvvm_ldu_c2:
|
|
case NVPTX::BI__nvvm_ldu_sc2:
|
|
case NVPTX::BI__nvvm_ldu_c4:
|
|
case NVPTX::BI__nvvm_ldu_sc4:
|
|
case NVPTX::BI__nvvm_ldu_s:
|
|
case NVPTX::BI__nvvm_ldu_s2:
|
|
case NVPTX::BI__nvvm_ldu_s4:
|
|
case NVPTX::BI__nvvm_ldu_i:
|
|
case NVPTX::BI__nvvm_ldu_i2:
|
|
case NVPTX::BI__nvvm_ldu_i4:
|
|
case NVPTX::BI__nvvm_ldu_l:
|
|
case NVPTX::BI__nvvm_ldu_l2:
|
|
case NVPTX::BI__nvvm_ldu_ll:
|
|
case NVPTX::BI__nvvm_ldu_ll2:
|
|
case NVPTX::BI__nvvm_ldu_uc:
|
|
case NVPTX::BI__nvvm_ldu_uc2:
|
|
case NVPTX::BI__nvvm_ldu_uc4:
|
|
case NVPTX::BI__nvvm_ldu_us:
|
|
case NVPTX::BI__nvvm_ldu_us2:
|
|
case NVPTX::BI__nvvm_ldu_us4:
|
|
case NVPTX::BI__nvvm_ldu_ui:
|
|
case NVPTX::BI__nvvm_ldu_ui2:
|
|
case NVPTX::BI__nvvm_ldu_ui4:
|
|
case NVPTX::BI__nvvm_ldu_ul:
|
|
case NVPTX::BI__nvvm_ldu_ul2:
|
|
case NVPTX::BI__nvvm_ldu_ull:
|
|
case NVPTX::BI__nvvm_ldu_ull2:
|
|
return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
|
|
case NVPTX::BI__nvvm_ldu_f:
|
|
case NVPTX::BI__nvvm_ldu_f2:
|
|
case NVPTX::BI__nvvm_ldu_f4:
|
|
case NVPTX::BI__nvvm_ldu_d:
|
|
case NVPTX::BI__nvvm_ldu_d2:
|
|
return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
|
|
|
|
case NVPTX::BI__nvvm_atom_cta_add_gen_i:
|
|
case NVPTX::BI__nvvm_atom_cta_add_gen_l:
|
|
case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_add_gen_i:
|
|
case NVPTX::BI__nvvm_atom_sys_add_gen_l:
|
|
case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_add_gen_f:
|
|
case NVPTX::BI__nvvm_atom_cta_add_gen_d:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_add_gen_f:
|
|
case NVPTX::BI__nvvm_atom_sys_add_gen_d:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
|
|
case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
|
|
case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
|
|
case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
|
|
case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_max_gen_i:
|
|
case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
|
|
case NVPTX::BI__nvvm_atom_cta_max_gen_l:
|
|
case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
|
|
case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
|
|
case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_max_gen_i:
|
|
case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
|
|
case NVPTX::BI__nvvm_atom_sys_max_gen_l:
|
|
case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
|
|
case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
|
|
case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_min_gen_i:
|
|
case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
|
|
case NVPTX::BI__nvvm_atom_cta_min_gen_l:
|
|
case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
|
|
case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
|
|
case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_min_gen_i:
|
|
case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
|
|
case NVPTX::BI__nvvm_atom_sys_min_gen_l:
|
|
case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
|
|
case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
|
|
case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_and_gen_i:
|
|
case NVPTX::BI__nvvm_atom_cta_and_gen_l:
|
|
case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_and_gen_i:
|
|
case NVPTX::BI__nvvm_atom_sys_and_gen_l:
|
|
case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_or_gen_i:
|
|
case NVPTX::BI__nvvm_atom_cta_or_gen_l:
|
|
case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_or_gen_i:
|
|
case NVPTX::BI__nvvm_atom_sys_or_gen_l:
|
|
case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
|
|
case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
|
|
case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
|
|
case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
|
|
case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
|
|
case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
|
|
return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
|
|
case NVPTX::BI__nvvm_atom_cta_cas_gen_us:
|
|
case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
|
|
case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
|
|
case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
|
|
Value *Ptr = EmitScalarExpr(E->getArg(0));
|
|
llvm::Type *ElemTy =
|
|
ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
|
|
return Builder.CreateCall(
|
|
CGM.getIntrinsic(
|
|
Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
|
|
{Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
|
|
}
|
|
case NVPTX::BI__nvvm_atom_sys_cas_gen_us:
|
|
case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
|
|
case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
|
|
case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
|
|
Value *Ptr = EmitScalarExpr(E->getArg(0));
|
|
llvm::Type *ElemTy =
|
|
ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
|
|
return Builder.CreateCall(
|
|
CGM.getIntrinsic(
|
|
Intrinsic::nvvm_atomic_cas_gen_i_sys, {ElemTy, Ptr->getType()}),
|
|
{Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
|
|
}
|
|
case NVPTX::BI__nvvm_match_all_sync_i32p:
|
|
case NVPTX::BI__nvvm_match_all_sync_i64p: {
|
|
Value *Mask = EmitScalarExpr(E->getArg(0));
|
|
Value *Val = EmitScalarExpr(E->getArg(1));
|
|
Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
|
|
Value *ResultPair = Builder.CreateCall(
|
|
CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
|
|
? Intrinsic::nvvm_match_all_sync_i32p
|
|
: Intrinsic::nvvm_match_all_sync_i64p),
|
|
{Mask, Val});
|
|
Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
|
|
PredOutPtr.getElementType());
|
|
Builder.CreateStore(Pred, PredOutPtr);
|
|
return Builder.CreateExtractValue(ResultPair, 0);
|
|
}
|
|
|
|
// FP MMA loads
|
|
case NVPTX::BI__hmma_m16n16k16_ld_a:
|
|
case NVPTX::BI__hmma_m16n16k16_ld_b:
|
|
case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
|
|
case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
|
|
case NVPTX::BI__hmma_m32n8k16_ld_a:
|
|
case NVPTX::BI__hmma_m32n8k16_ld_b:
|
|
case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
|
|
case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
|
|
case NVPTX::BI__hmma_m8n32k16_ld_a:
|
|
case NVPTX::BI__hmma_m8n32k16_ld_b:
|
|
case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
|
|
case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
|
|
  // Integer MMA loads.
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
  case NVPTX::BI__imma_m16n16k16_ld_c:
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
  case NVPTX::BI__imma_m32n8k16_ld_c:
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
  case NVPTX::BI__imma_m8n32k16_ld_c:
  // Sub-integer MMA loads.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
  case NVPTX::BI__imma_m8n8k32_ld_c:
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_c:
  // Double MMA loads.
  case NVPTX::BI__dmma_m8n8k4_ld_a:
  case NVPTX::BI__dmma_m8n8k4_ld_b:
  case NVPTX::BI__dmma_m8n8k4_ld_c:
  // Alternate float MMA loads.
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c: {
    // All MMA-load builtins share the argument shape
    //   __*_ld*(dst, src, ldm, isColMajor)
    // where isColMajor must be an integer constant expression; a non-constant
    // argument makes the builtin unsupported (nullptr -> error upstream).
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Value *Src = EmitScalarExpr(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    // Pick the row-major or column-major intrinsic variant; an IID of 0 means
    // no intrinsic exists for this builtin/layout combination.
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;

    Value *Result =
        Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});

    // Save returned values.
    assert(II.NumResults);
    if (II.NumResults == 1) {
      // Single-element result: store it directly to *dst.
      Builder.CreateAlignedStore(Result, Dst.emitRawPointer(*this),
                                 CharUnits::fromQuantity(4));
    } else {
      // Multi-element result: unpack each aggregate element, bitcast to the
      // destination element type, and store to dst[i].
      for (unsigned i = 0; i < II.NumResults; ++i) {
        Builder.CreateAlignedStore(
            Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
                                  Dst.getElementType()),
            Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                              llvm::ConstantInt::get(IntTy, i)),
            CharUnits::fromQuantity(4));
      }
    }
    return Result;
  }
|
|
|
|
  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
  case NVPTX::BI__mma_m16n16k8_st_c_f32: {
    // MMA-store builtins share the argument shape
    //   __*_st_c*(dst, src, ldm, isColMajor)
    // where isColMajor must be an integer constant expression.
    Value *Dst = EmitScalarExpr(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    // IID == 0 means no intrinsic exists for this builtin/layout combination.
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;
    Function *Intrinsic =
        CGM.getIntrinsic(IID, Dst->getType());
    // All fragment parameters of the intrinsic have the same type; use the
    // type of parameter 1 (the first fragment after the pointer).
    llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
    SmallVector<Value *, 10> Values = {Dst};
    // Load each fragment element from src[i] and bitcast it to the type the
    // intrinsic expects.
    for (unsigned i = 0; i < II.NumResults; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          Src.getElementType(),
          Builder.CreateGEP(Src.getElementType(), Src.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, ParamType));
    }
    // The leading dimension (stride) is the intrinsic's final operand.
    Values.push_back(Ldm);
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    return Result;
  }
|
|
|
|
  // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
  // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
  case NVPTX::BI__imma_m16n16k16_mma_s8:
  case NVPTX::BI__imma_m16n16k16_mma_u8:
  case NVPTX::BI__imma_m32n8k16_mma_s8:
  case NVPTX::BI__imma_m32n8k16_mma_u8:
  case NVPTX::BI__imma_m8n32k16_mma_s8:
  case NVPTX::BI__imma_m8n32k16_mma_u8:
  case NVPTX::BI__imma_m8n8k32_mma_s4:
  case NVPTX::BI__imma_m8n8k32_mma_u4:
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: {
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Address SrcA = EmitPointerWithAlignment(E->getArg(1));
    Address SrcB = EmitPointerWithAlignment(E->getArg(2));
    Address SrcC = EmitPointerWithAlignment(E->getArg(3));
    // The layout argument must be a constant in [0, 3]: it encodes the
    // row/column-major combination for the A and B operands.
    std::optional<llvm::APSInt> LayoutArg =
        E->getArg(4)->getIntegerConstantExpr(getContext());
    if (!LayoutArg)
      return nullptr;
    int Layout = LayoutArg->getSExtValue();
    if (Layout < 0 || Layout > 3)
      return nullptr;
    llvm::APSInt SatfArg;
    if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
        BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
      SatfArg = 0; // .b1 does not have satf argument.
    else if (std::optional<llvm::APSInt> OptSatfArg =
                 E->getArg(5)->getIntegerConstantExpr(getContext()))
      SatfArg = *OptSatfArg;
    else
      return nullptr;
    bool Satf = SatfArg.getSExtValue();
    NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
    unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
    if (IID == 0) // Unsupported combination of Layout/Satf.
      return nullptr;

    SmallVector<Value *, 24> Values;
    Function *Intrinsic = CGM.getIntrinsic(IID);
    // The intrinsic takes the A, B and C fragments flattened into scalar
    // operands, in that order; load each fragment element from memory and
    // bitcast it to the parameter type the intrinsic expects.
    llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
    // Load A
    for (unsigned i = 0; i < MI.NumEltsA; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcA.getElementType(),
          Builder.CreateGEP(SrcA.getElementType(), SrcA.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, AType));
    }
    // Load B
    llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
    for (unsigned i = 0; i < MI.NumEltsB; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcB.getElementType(),
          Builder.CreateGEP(SrcB.getElementType(), SrcB.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, BType));
    }
    // Load C
    llvm::Type *CType =
        Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
    for (unsigned i = 0; i < MI.NumEltsC; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcC.getElementType(),
          Builder.CreateGEP(SrcC.getElementType(), SrcC.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, CType));
    }
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    // Unpack the aggregate result into the destination fragment, element by
    // element, bitcasting back to the destination element type.
    llvm::Type *DType = Dst.getElementType();
    for (unsigned i = 0; i < MI.NumEltsD; ++i)
      Builder.CreateAlignedStore(
          Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
          Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
    return Result;
  }
|
|
  // The following builtins require half type support; MakeHalfType verifies
  // target support before emitting the corresponding NVVM intrinsic call.
  case NVPTX::BI__nvvm_ex2_approx_f16:
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ex2_approx_f16x2:
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
|
|
  // Absolute-value builtins: float/bf16/f16 variants all map onto the
  // overloaded nvvm_fabs (or nvvm_fabs_ftz) intrinsic; the overload type is
  // inferred from the operand.
  case NVPTX::BI__nvvm_fabs_f:
  case NVPTX::BI__nvvm_abs_bf16:
  case NVPTX::BI__nvvm_abs_bf16x2:
  case NVPTX::BI__nvvm_fabs_f16:
  case NVPTX::BI__nvvm_fabs_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_ftz_f:
  case NVPTX::BI__nvvm_fabs_ftz_f16:
  case NVPTX::BI__nvvm_fabs_ftz_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs_ftz,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_d:
    // Double fabs uses the generic LLVM intrinsic rather than an NVVM one.
    return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
                                        EmitScalarExpr(E->getArg(0)));
  // Half-typed read-only cache loads. For ldg, MakeHalfType is called with
  // not_intrinsic; for ldu it emits nvvm_ldu_global_f.
  case NVPTX::BI__nvvm_ldg_h:
  case NVPTX::BI__nvvm_ldg_h2:
    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ldu_h:
  case NVPTX::BI__nvvm_ldu_h2:
    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
|
|
  // Asynchronous global->shared copies. MakeCpAsync selects between the
  // plain intrinsic and its "_s" (src-size) variant and passes the element
  // size in bytes (4/8/16).
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                       Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
                       4);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
                       Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
                       8);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
                       Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E,
                       16);
  case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
                       Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E,
                       16);
|
|
  // Cluster special-register reads: each builtin lowers to a zero-argument
  // call of the matching nvvm_read_ptx_sreg_* intrinsic.
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank));
|
|
  // Miscellaneous cluster helpers: each builtin forwards its scalar
  // argument(s) directly to the same-named NVVM intrinsic.
  case NVPTX::BI__nvvm_is_explicit_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_is_explicit_cluster));
  case NVPTX::BI__nvvm_isspacep_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_isspacep_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_mapa:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_mapa_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa_shared_cluster),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_getctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_getctarank_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
|
|
  // Cluster-scope barriers and the cluster-scope sequentially-consistent
  // fence; all are zero-argument intrinsic calls.
  case NVPTX::BI__nvvm_barrier_cluster_arrive:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
  case NVPTX::BI__nvvm_barrier_cluster_wait:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
  case NVPTX::BI__nvvm_fence_sc_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
|
|
  // CTA-scope barriers, lowered onto the unified llvm.nvvm.barrier.cta.*
  // intrinsic family:
  //   __nvvm_bar_sync(x)          -> llvm.nvvm.barrier.cta.sync.aligned.all(x)
  //   __syncthreads()             -> llvm.nvvm.barrier.cta.sync.aligned.all(0)
  //   __nvvm_barrier_sync(x)      -> llvm.nvvm.barrier.cta.sync.all(x)
  //   __nvvm_barrier_sync_cnt(x,y)-> llvm.nvvm.barrier.cta.sync.count(x, y)
  case NVPTX::BI__nvvm_bar_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__syncthreads:
    // __syncthreads takes no arguments; it is barrier 0 for all threads.
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        Builder.getInt32(0));
  case NVPTX::BI__nvvm_barrier_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_sync_cnt:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  default:
    // Unknown builtin for this target; callers treat nullptr as "not
    // handled".
    return nullptr;
|
|
}
|
|
}
|