[NVPTX] Support intrinsics for reserved shared memory special registers (#182354)

Added reserved_smem_offset_{begin|end|cap|0|1} intrinsics to expose the
reserved shared memory special registers, and NVPTX TableGen support for
these intrinsics.
This commit is contained in:
yasmincs 2026-02-26 14:34:59 -08:00 committed by GitHub
parent fb81e59ccb
commit dbaa4d117a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 103 additions and 4 deletions

View File

@ -300,6 +300,42 @@ including the portion reserved for system software use.
The '``dynamic_smem_size``' variant returns the amount of dynamic shared
memory allocated per CTA for the kernel at launch time.
'``llvm.nvvm.read.ptx.sreg.reserved_smem_offset_*``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
.. code-block:: llvm

    declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_begin()
    declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_end()
    declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_cap()
    declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_0()
    declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_1()

Overview:
"""""""""
The '``@llvm.nvvm.read.ptx.sreg.reserved_smem_offset_*``' intrinsics provide
access to PTX special registers that hold information about reserved shared
memory offsets.
The '``reserved_smem_offset_begin``' intrinsic reads the start offset of
the reserved shared memory region.
The '``reserved_smem_offset_end``' intrinsic reads the end offset of the
reserved shared memory region.
The '``reserved_smem_offset_cap``' intrinsic reads the capacity limit of
the reserved shared memory region.
The '``reserved_smem_offset_0``' and '``reserved_smem_offset_1``' intrinsics
read additional offsets in the reserved shared memory region.
For more information, refer to the `PTX ISA
<https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-reserved-smem>`__.
Barriers
--------

View File

@ -2352,7 +2352,8 @@ foreach vec = [TV_I8, TV_I16, TV_I32,
// Accessing special registers.
//
// Base class for argument-less intrinsics that return a single i32.
//   properties - extra IntrinsicProperty entries appended after
//                NoUndef<RetIndex>.
//   name       - in the two-line parent clause below, a non-empty name is
//                prefixed with "llvm.nvvm.read.ptx.sreg."; an empty name is
//                forwarded unchanged as "".
// NOTE(review): two parent-class clauses follow the class header (one forwards
// `name` verbatim, the other adds the prefix). This looks like the removed and
// added lines of a diff rendered without +/- markers -- confirm which is live.
class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = [], string name = "">
: NVVMPureIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>] # properties, name>;
: NVVMPureIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>] # properties,
!if(!eq(name, ""), "", !strconcat("llvm.nvvm.read.ptx.sreg.", name))>;
// Variant of PTXReadSRegIntrinsicNB_r32 that additionally inherits
// NVVMBuiltin. Only `properties` is forwarded, so the base class `name`
// parameter keeps its default value "".
class PTXReadSRegIntrinsic_r32<list<IntrinsicProperty> properties = []>
: PTXReadSRegIntrinsicNB_r32<properties>, NVVMBuiltin;
@ -2452,12 +2453,26 @@ defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32<MAX_GR
def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
//
// Reserved Shared Memory Intrinsics
//
// Defines one int_nvvm_read_ptx_sreg_reserved_smem_offset_<suffix> record per
// suffix; each passes only the register name (without any "llvm.nvvm..."
// prefix) as the `name` argument of PTXReadSRegIntrinsicNB_r32.
foreach suffix = ["begin", "end", "cap", "0", "1"] in {
  defvar sreg = "reserved_smem_offset_" # suffix;
  def "int_nvvm_read_ptx_sreg_" # sreg
    : PTXReadSRegIntrinsicNB_r32<name = sreg>;
}
// Special-register read intrinsics for total_smem_size, aggr_smem_size and
// dynamic_smem_size.
// NOTE(review): each def below is followed by two parent-class lines, one
// passing the full "llvm.nvvm.read.ptx.sreg.*" string as `name` and one
// passing only the register suffix. These look like the removed/added lines of
// a diff shown without +/- markers; only one form can apply per def -- confirm.
def int_nvvm_read_ptx_sreg_total_smem_size :
PTXReadSRegIntrinsicNB_r32<name = "llvm.nvvm.read.ptx.sreg.total_smem_size">;
PTXReadSRegIntrinsicNB_r32<name = "total_smem_size">;
def int_nvvm_read_ptx_sreg_aggr_smem_size :
PTXReadSRegIntrinsicNB_r32<name = "llvm.nvvm.read.ptx.sreg.aggr_smem_size">;
PTXReadSRegIntrinsicNB_r32<name = "aggr_smem_size">;
def int_nvvm_read_ptx_sreg_dynamic_smem_size :
PTXReadSRegIntrinsicNB_r32<name = "llvm.nvvm.read.ptx.sreg.dynamic_smem_size">;
PTXReadSRegIntrinsicNB_r32<name = "dynamic_smem_size">;
//
// SHUFFLE

View File

@ -4926,6 +4926,14 @@ def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
def INT_PTX_SREG_PM2 : PTX_READ_SREG_R32<"pm2", int_nvvm_read_ptx_sreg_pm2>;
def INT_PTX_SREG_PM3 : PTX_READ_SREG_R32<"pm3", int_nvvm_read_ptx_sreg_pm3>;
// Reads of the reserved shared memory special registers. Every suffix yields
// an INT_PTX_SREG_RESERVED_SMEM_OFFSET_<SUFFIX> record that pairs the register
// name with its matching intrinsic, predicated on hasPTX<76> and hasSM<80>.
foreach suffix = ["begin", "end", "cap", "0", "1"] in {
  defvar sreg = "reserved_smem_offset_" # suffix;
  def "INT_PTX_SREG_" # !toupper(sreg) :
    PTX_READ_SREG_R32<sreg,
                      !cast<Intrinsic>("int_nvvm_read_ptx_sreg_" # sreg),
                      [hasPTX<76>, hasSM<80>]>;
}
// TODO: It would be nice to use PTX_READ_SREG here, but it doesn't
// handle the constant.
def INT_PTX_SREG_WARPSIZE :

View File

@ -0,0 +1,40 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_80 -mattr=+ptx76 | FileCheck %s
; RUN: %if ptxas-sm_80 && ptxas-isa-7.6 %{ llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_80 -mattr=+ptx76 | %ptxas-verify -arch=sm_80 %}
declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_begin()
declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_end()
declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_cap()
declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_0()
declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_1()
; Calls each of the five reserved_smem_offset_* intrinsics once and returns the
; sum of the results, so every read feeds the return value. The CHECK lines
; expect one mov.u32 from the matching %reserved_smem_offset_* register per
; call, followed by the chain of adds.
define i32 @test() {
; CHECK-LABEL: test(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.u32 %r1, %reserved_smem_offset_begin;
; CHECK-NEXT: mov.u32 %r2, %reserved_smem_offset_end;
; CHECK-NEXT: mov.u32 %r3, %reserved_smem_offset_cap;
; CHECK-NEXT: mov.u32 %r4, %reserved_smem_offset_0;
; CHECK-NEXT: mov.u32 %r5, %reserved_smem_offset_1;
; CHECK-NEXT: add.s32 %r6, %r1, %r2;
; CHECK-NEXT: add.s32 %r7, %r6, %r3;
; CHECK-NEXT: add.s32 %r8, %r7, %r4;
; CHECK-NEXT: add.s32 %r9, %r8, %r5;
; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
; CHECK-NEXT: ret;
%begin = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_begin()
%end = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_end()
%cap = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_cap()
%offset0 = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_0()
%offset1 = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_1()
%ret0 = add i32 %begin, %end
%ret1 = add i32 %ret0, %cap
%ret2 = add i32 %ret1, %offset0
%ret3 = add i32 %ret2, %offset1
ret i32 %ret3
}