From dbaa4d117aac3bf25505ca78a4b9752e1bb9c3dc Mon Sep 17 00:00:00 2001 From: yasmincs Date: Thu, 26 Feb 2026 14:34:59 -0800 Subject: [PATCH] [NVPTX] Support intrinsics for reserved shared memory special registers (#182354) Added reserved_smem_offset_{begin|end|cap|0} intrinsics to expose shared memory special registers and NVPTX TableGen support for these intrinsics. --- llvm/docs/NVPTXUsage.rst | 36 +++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 23 +++++++++-- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 8 ++++ .../CodeGen/NVPTX/reserved-smem-offset.ll | 40 +++++++++++++++++++ 4 files changed, 103 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/reserved-smem-offset.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 0e7e21ad46b8..f4dd87ea8226 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -300,6 +300,42 @@ including the portion reserved for system software use. The '``dynamic_smem_size``' variant returns the amount of dynamic shared memory allocated per CTA for the kernel at launch time. +'``llvm.nvvm.read.ptx.sreg.reserved_smem_offset_*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_begin() + declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_end() + declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_cap() + declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_0() + declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_1() + +Overview: +""""""""" + +The '``@llvm.nvvm.read.ptx.sreg.reserved_smem_offset_*``' intrinsics provide +access to PTX special registers that hold information about reserved shared +memory offsets. + +The '``reserved_smem_offset_begin``' intrinsic reads the start offset of +the reserved shared memory region. + +The '``reserved_smem_offset_end``' intrinsic reads the end offset of the +reserved shared memory region. 
+ +The '``reserved_smem_offset_cap``' intrinsic reads the capacity limit of +the reserved shared memory region. + +The '``reserved_smem_offset_0``' and '``reserved_smem_offset_1``' intrinsics +read additional offsets in the reserved shared memory region. + +For more information, refer to the `PTX ISA +<https://docs.nvidia.com/cuda/parallel-thread-execution/>`__. + Barriers -------- diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index f0c3c3208695..b3e0033d005a 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2352,7 +2352,8 @@ foreach vec = [TV_I8, TV_I16, TV_I32, // Accessing special registers. // class PTXReadSRegIntrinsicNB_r32 properties = [], string name = ""> - : NVVMPureIntrinsic<[llvm_i32_ty], [], [NoUndef] # properties, name>; + : NVVMPureIntrinsic<[llvm_i32_ty], [], [NoUndef] # properties, + !if(!eq(name, ""), "", !strconcat("llvm.nvvm.read.ptx.sreg.", name))>; class PTXReadSRegIntrinsic_r32 properties = []> : PTXReadSRegIntrinsicNB_r32, NVVMBuiltin; @@ -2452,12 +2453,26 @@ defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32; +def int_nvvm_read_ptx_sreg_reserved_smem_offset_end + : PTXReadSRegIntrinsicNB_r32; +def int_nvvm_read_ptx_sreg_reserved_smem_offset_cap + : PTXReadSRegIntrinsicNB_r32; +def int_nvvm_read_ptx_sreg_reserved_smem_offset_0 + : PTXReadSRegIntrinsicNB_r32; +def int_nvvm_read_ptx_sreg_reserved_smem_offset_1 + : PTXReadSRegIntrinsicNB_r32; + def int_nvvm_read_ptx_sreg_total_smem_size : - PTXReadSRegIntrinsicNB_r32; + PTXReadSRegIntrinsicNB_r32; def int_nvvm_read_ptx_sreg_aggr_smem_size : - PTXReadSRegIntrinsicNB_r32; + PTXReadSRegIntrinsicNB_r32; def int_nvvm_read_ptx_sreg_dynamic_smem_size : - PTXReadSRegIntrinsicNB_r32; + PTXReadSRegIntrinsicNB_r32; // // SHUFFLE diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 090ee63589fc..26f5f3f5160f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ 
b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -4926,6 +4926,14 @@ def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>; def INT_PTX_SREG_PM2 : PTX_READ_SREG_R32<"pm2", int_nvvm_read_ptx_sreg_pm2>; def INT_PTX_SREG_PM3 : PTX_READ_SREG_R32<"pm3", int_nvvm_read_ptx_sreg_pm3>; +// Reserved shared memory special register reads +foreach suffix = ["begin", "end", "cap", "0", "1"] in { + defvar regname = "reserved_smem_offset_" # suffix; + defvar intr = !cast("int_nvvm_read_ptx_sreg_" # regname); + def "INT_PTX_SREG_RESERVED_SMEM_OFFSET_" # !toupper(suffix) : + PTX_READ_SREG_R32, hasSM<80>]>; +} + // TODO: It would be nice to use PTX_READ_SREG here, but it doesn't // handle the constant. def INT_PTX_SREG_WARPSIZE : diff --git a/llvm/test/CodeGen/NVPTX/reserved-smem-offset.ll b/llvm/test/CodeGen/NVPTX/reserved-smem-offset.ll new file mode 100644 index 000000000000..46c010fb614a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/reserved-smem-offset.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_80 -mattr=+ptx76 | FileCheck %s +; RUN: %if ptxas-sm_80 && ptxas-isa-7.6 %{ llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_80 -mattr=+ptx76 | %ptxas-verify -arch=sm_80 %} + +declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_begin() +declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_end() +declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_cap() +declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_0() +declare i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_1() + +define i32 @test() { +; CHECK-LABEL: test( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.u32 %r1, %reserved_smem_offset_begin; +; CHECK-NEXT: mov.u32 %r2, %reserved_smem_offset_end; +; CHECK-NEXT: mov.u32 %r3, %reserved_smem_offset_cap; +; CHECK-NEXT: mov.u32 %r4, %reserved_smem_offset_0; +; 
CHECK-NEXT: mov.u32 %r5, %reserved_smem_offset_1; +; CHECK-NEXT: add.s32 %r6, %r1, %r2; +; CHECK-NEXT: add.s32 %r7, %r6, %r3; +; CHECK-NEXT: add.s32 %r8, %r7, %r4; +; CHECK-NEXT: add.s32 %r9, %r8, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r9; +; CHECK-NEXT: ret; + %begin = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_begin() + %end = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_end() + %cap = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_cap() + %offset0 = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_0() + %offset1 = call i32 @llvm.nvvm.read.ptx.sreg.reserved_smem_offset_1() + + %ret0 = add i32 %begin, %end + %ret1 = add i32 %ret0, %cap + %ret2 = add i32 %ret1, %offset0 + %ret3 = add i32 %ret2, %offset1 + + ret i32 %ret3 +}