llvm-project/llvm/test/CodeGen/NVPTX/addrspacecast.ll
Alex MacLean 369891b674
[NVPTX] use untyped loads and stores where ever possible (#137698)
In most cases, the type information attached to load and store
instructions is meaningless and inconsistently applied. We can usually
use ".b" loads and avoid the complexity of trying to assign the correct
type. The one exception is a sign-extending load, which will continue to
use ".s" to ensure the sign extension into a larger register is done
correctly.
2025-05-10 08:26:26 -07:00

204 lines
6.5 KiB
LLVM

; Three llc configurations are exercised at -O0, each with its own set of
; FileCheck prefixes:
;   * -mtriple=nvptx                       -> ALL, CLS32
;   * -mtriple=nvptx64                     -> ALL, NOPTRCONV, CLS64
;   * -mtriple=nvptx64 --nvptx-short-ptr   -> ALL, PTRCONV, CLS64
; The last three lines repeat the same llc invocations and pipe the output to
; %ptxas-verify when the `ptxas` feature is available; the 32-bit invocation is
; additionally skipped when the `ptxas-12.0` feature is set.
; RUN: llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32
; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64
; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64
; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %}
; addrspace(1) -> default address space, followed by a load through the result.
; Expected output: cvta.global.u32 on the 32-bit target, cvta.global.u64 on the
; 64-bit targets, and a load without a state-space qualifier. The checks also
; forbid a cvt.u64.u32 widening.
; ALL-LABEL: conv1
define i32 @conv1(ptr addrspace(1) %ptr) {
; CLS32: cvta.global.u32
; ALL-NOT: cvt.u64.u32
; CLS64: cvta.global.u64
; ALL: ld.b32
  %generic = addrspacecast ptr addrspace(1) %ptr to ptr
  %loaded = load i32, ptr %generic
  ret i32 %loaded
}
; addrspace(3) -> default address space, followed by a load through the result.
; Expected output: cvta.shared.u32 on the 32-bit target and cvta.shared.u64 on
; the 64-bit targets. With --nvptx-short-ptr a cvt.u64.u32 must come before the
; cvta; without that flag the widening must not appear.
; ALL-LABEL: conv2
define i32 @conv2(ptr addrspace(3) %ptr) {
; CLS32: cvta.shared.u32
; PTRCONV: cvt.u64.u32
; NOPTRCONV-NOT: cvt.u64.u32
; CLS64: cvta.shared.u64
; ALL: ld.b32
  %generic = addrspacecast ptr addrspace(3) %ptr to ptr
  %loaded = load i32, ptr %generic
  ret i32 %loaded
}
; addrspace(4) -> default address space, followed by a load through the result.
; Expected output: cvta.const.u32 on the 32-bit target and cvta.const.u64 on
; the 64-bit targets. With --nvptx-short-ptr a cvt.u64.u32 must come before the
; cvta; without that flag the widening must not appear.
; ALL-LABEL: conv3
define i32 @conv3(ptr addrspace(4) %ptr) {
; CLS32: cvta.const.u32
; PTRCONV: cvt.u64.u32
; NOPTRCONV-NOT: cvt.u64.u32
; CLS64: cvta.const.u64
; ALL: ld.b32
  %generic = addrspacecast ptr addrspace(4) %ptr to ptr
  %loaded = load i32, ptr %generic
  ret i32 %loaded
}
; addrspace(5) -> default address space, followed by a load through the result.
; Expected output: cvta.local.u32 on the 32-bit target and cvta.local.u64 on
; the 64-bit targets. With --nvptx-short-ptr a cvt.u64.u32 must come before the
; cvta; without that flag the widening must not appear.
; ALL-LABEL: conv4
define i32 @conv4(ptr addrspace(5) %ptr) {
; CLS32: cvta.local.u32
; PTRCONV: cvt.u64.u32
; NOPTRCONV-NOT: cvt.u64.u32
; CLS64: cvta.local.u64
; ALL: ld.b32
  %generic = addrspacecast ptr addrspace(5) %ptr to ptr
  %loaded = load i32, ptr %generic
  ret i32 %loaded
}
; Default address space -> addrspace(1), followed by a load through the result.
; Expected output: cvta.to.global.u32 on the 32-bit target, cvta.to.global.u64
; on the 64-bit targets, and an ld.global load.
;
; No pointer-width conversion is expected in any configuration. The original
; negative check only covered the widening form (cvt.u64.u32); the narrowing
; form (cvt.u32.u64) is the one the sibling tests conv6-conv8 look for after a
; cvta.to.*, so it is rejected here as well, between the cvta and the load.
; ALL-LABEL: conv5
define i32 @conv5(ptr %ptr) {
; CLS32: cvta.to.global.u32
; ALL-NOT: cvt.u64.u32
; CLS64: cvta.to.global.u64
; ALL-NOT: cvt.u32.u64
; ALL: ld.global.b32
  %specptr = addrspacecast ptr %ptr to ptr addrspace(1)
  %val = load i32, ptr addrspace(1) %specptr
  ret i32 %val
}
; Default address space -> addrspace(3), followed by a load through the result.
; Expected output: cvta.to.shared.u32 on the 32-bit target and
; cvta.to.shared.u64 on the 64-bit targets, then an ld.shared load. With
; --nvptx-short-ptr a cvt.u32.u64 must follow the cvta; without that flag the
; narrowing must not appear.
; ALL-LABEL: conv6
define i32 @conv6(ptr %ptr) {
; CLS32: cvta.to.shared.u32
; CLS64: cvta.to.shared.u64
; PTRCONV: cvt.u32.u64
; NOPTRCONV-NOT: cvt.u32.u64
; ALL: ld.shared.b32
  %specific = addrspacecast ptr %ptr to ptr addrspace(3)
  %loaded = load i32, ptr addrspace(3) %specific
  ret i32 %loaded
}
; Default address space -> addrspace(4), followed by a load through the result.
; Expected output: cvta.to.const.u32 on the 32-bit target and
; cvta.to.const.u64 on the 64-bit targets, then an ld.const load. With
; --nvptx-short-ptr a cvt.u32.u64 must follow the cvta; without that flag the
; narrowing must not appear.
; ALL-LABEL: conv7
define i32 @conv7(ptr %ptr) {
; CLS32: cvta.to.const.u32
; CLS64: cvta.to.const.u64
; PTRCONV: cvt.u32.u64
; NOPTRCONV-NOT: cvt.u32.u64
; ALL: ld.const.b32
  %specific = addrspacecast ptr %ptr to ptr addrspace(4)
  %loaded = load i32, ptr addrspace(4) %specific
  ret i32 %loaded
}
; Default address space -> addrspace(5), followed by a load through the result.
; Expected output: cvta.to.local.u32 on the 32-bit target and
; cvta.to.local.u64 on the 64-bit targets, then an ld.local load. With
; --nvptx-short-ptr a cvt.u32.u64 must follow the cvta; without that flag the
; narrowing must not appear.
; ALL-LABEL: conv8
define i32 @conv8(ptr %ptr) {
; CLS32: cvta.to.local.u32
; CLS64: cvta.to.local.u64
; PTRCONV: cvt.u32.u64
; NOPTRCONV-NOT: cvt.u32.u64
; ALL: ld.local.b32
  %specific = addrspacecast ptr %ptr to ptr addrspace(5)
  %loaded = load i32, ptr addrspace(5) %specific
  ret i32 %loaded
}
; addrspace(1) -> addrspace(3): a cast between two non-default address spaces.
; The checks only look for an `implicit-def` register comment and an ld.shared
; load whose address is that same register. The register matches %rN on the
; 32-bit target and with --nvptx-short-ptr, and %rdN on plain nvptx64.
; NOTE(review): this suggests the backend treats the cast result as undefined
; rather than emitting a conversion; confirm against the NVPTX lowering code.
; ALL-LABEL: conv9
define i32 @conv9(ptr addrspace(1) %ptr) {
; CLS32: // implicit-def: %[[ADDR:r[0-9]+]]
; PTRCONV: // implicit-def: %[[ADDR:r[0-9]+]]
; NOPTRCONV: // implicit-def: %[[ADDR:rd[0-9]+]]
; ALL: ld.shared.b32 %r{{[0-9]+}}, [%[[ADDR]]]
  %specific = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(3)
  %loaded = load i32, ptr addrspace(3) %specific
  ret i32 %loaded
}
; addrspacecast on a pointer vector whose result has to be split
; (<2 x ptr> => 2 x <1 x ptr>). The same input also exercises scalarization of
; the cast (<1 x ptr> => ptr).
; Expected output: one cvta.global per element (u32 on the 32-bit target, u64 on
; the 64-bit targets) and two stores without a state-space qualifier.
; ALL-LABEL: split1To0
define void @split1To0(ptr nocapture noundef readonly %xs) {
; CLS32: cvta.global.u32
; CLS32: cvta.global.u32
; CLS64: cvta.global.u64
; CLS64: cvta.global.u64
; ALL: st.b32
; ALL: st.b32
  %ptrs = load <2 x ptr addrspace(1)>, ptr %xs, align 16
  %cast = addrspacecast <2 x ptr addrspace(1)> %ptrs to <2 x ptr>
  %p0 = extractelement <2 x ptr> %cast, i64 0
  store float 0.5, ptr %p0, align 4
  %p1 = extractelement <2 x ptr> %cast, i64 1
  store float 1.0, ptr %p1, align 4
  ret void
}
; Mirror image of split1To0: the cast goes from address space 0 to 1, to make
; sure the split addrspacecast preserves the source and destination address
; spaces properly.
; Expected output: one cvta.to.global per element (u32 on the 32-bit target, u64
; on the 64-bit targets) and two st.global stores.
; ALL-LABEL: split0To1
define void @split0To1(ptr nocapture noundef readonly %xs) {
; CLS32: cvta.to.global.u32
; CLS32: cvta.to.global.u32
; CLS64: cvta.to.global.u64
; CLS64: cvta.to.global.u64
; ALL: st.global.b32
; ALL: st.global.b32
  %ptrs = load <2 x ptr>, ptr %xs, align 16
  %cast = addrspacecast <2 x ptr> %ptrs to <2 x ptr addrspace(1)>
  %p0 = extractelement <2 x ptr addrspace(1)> %cast, i64 0
  store float 0.5, ptr addrspace(1) %p0, align 4
  %p1 = extractelement <2 x ptr addrspace(1)> %cast, i64 1
  store float 1.0, ptr addrspace(1) %p1, align 4
  ret void
}
; addrspacecast on a pointer vector that needs widening
; (3 x ptr => 4 x ptr).
; Expected output: three cvta.global conversions (u32 on the 32-bit target, u64
; on the 64-bit targets) and three stores without a state-space qualifier.
; ALL-LABEL: widen1To0
define void @widen1To0(ptr nocapture noundef readonly %xs) {
; CLS32: cvta.global.u32
; CLS32: cvta.global.u32
; CLS32: cvta.global.u32
; CLS64: cvta.global.u64
; CLS64: cvta.global.u64
; CLS64: cvta.global.u64
; ALL: st.b32
; ALL: st.b32
; ALL: st.b32
  %ptrs = load <3 x ptr addrspace(1)>, ptr %xs, align 16
  %cast = addrspacecast <3 x ptr addrspace(1)> %ptrs to <3 x ptr>
  %p0 = extractelement <3 x ptr> %cast, i64 0
  store float 0.5, ptr %p0, align 4
  %p1 = extractelement <3 x ptr> %cast, i64 1
  store float 1.0, ptr %p1, align 4
  %p2 = extractelement <3 x ptr> %cast, i64 2
  store float 1.5, ptr %p2, align 4
  ret void
}
; Mirror image of widen1To0: the cast goes from address space 0 to 1, to make
; sure the widened addrspacecast preserves the source and destination address
; spaces properly.
; Expected output: three cvta.to.global conversions (u32 on the 32-bit target,
; u64 on the 64-bit targets) and three st.global stores.
; ALL-LABEL: widen0To1
define void @widen0To1(ptr nocapture noundef readonly %xs) {
; CLS32: cvta.to.global.u32
; CLS32: cvta.to.global.u32
; CLS32: cvta.to.global.u32
; CLS64: cvta.to.global.u64
; CLS64: cvta.to.global.u64
; CLS64: cvta.to.global.u64
; ALL: st.global.b32
; ALL: st.global.b32
; ALL: st.global.b32
  %ptrs = load <3 x ptr>, ptr %xs, align 16
  %cast = addrspacecast <3 x ptr> %ptrs to <3 x ptr addrspace(1)>
  %p0 = extractelement <3 x ptr addrspace(1)> %cast, i64 0
  store float 0.5, ptr addrspace(1) %p0, align 4
  %p1 = extractelement <3 x ptr addrspace(1)> %cast, i64 1
  store float 1.0, ptr addrspace(1) %p1, align 4
  %p2 = extractelement <3 x ptr addrspace(1)> %cast, i64 2
  store float 1.5, ptr addrspace(1) %p2, align 4
  ret void
}