llvm-project/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll
Princeton Ferro 1fdbe69849
[NVPTX] support f32x2 instructions for sm_100+ (#126337)
Lower `fadd`, `fsub`, `fmul`, and `fma` to f32x2 variants introduced in
PTX 8.6 for sm_100+. Adds a new register class for v2f32 as a b64
register in PTX. This causes other vector operations like loads and
stores to lower as .b64 instead of .v2.b32 as appropriate.

Also update test cases to use the autogenerator.
2025-07-11 11:50:50 -07:00

540 lines
20 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX
; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}
; In this test, we check that all the addressing modes are lowered correctly,
; addr can be any of the following:
; - avar : direct address
; - asi: direct address + offset
; - areg_64: 64-bit register
; - ari_64: 64-bit register + offset
; Since 256-bit loads/stores are a Blackwell+ feature,
; and 32-bit addressing is not supported after sm_90,
; the 32-bit "areg" and "ari" addressing modes are neither tested nor supported.
; Checks 8 types: i8, i16, half, bfloat, i32, i64, float, double.
; Global is the only address space that currently supports 256-bit loads/stores.
; External symbols in addrspace(1). The avar_* and asi_* functions below use
; them as the load source (@globalin) and the store destination (@globalout).
@globalin = external addrspace(1) global ptr
@globalout = external addrspace(1) global ptr
; Direct-address ("avar") case for <32 x i8>: the vector is loaded straight
; from @globalin and stored straight to @globalout, with no offset.
; The assertions expect a single ld.global.v8.b32 / st.global.v8.b32 pair
; using eight 32-bit registers and the bare symbols as addresses.
define void @avar_i8() {
; PTX-LABEL: avar_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <32 x i8>, ptr addrspace(1) @globalin
store <32 x i8> %load, ptr addrspace(1) @globalout
ret void
}
; Direct-address ("avar") case for <16 x i16>: the vector is loaded straight
; from @globalin and stored straight to @globalout, with no offset.
; The assertions expect a single ld.global.v8.b32 / st.global.v8.b32 pair
; using eight 32-bit registers and the bare symbols as addresses.
define void @avar_i16() {
; PTX-LABEL: avar_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x i16>, ptr addrspace(1) @globalin
store <16 x i16> %load, ptr addrspace(1) @globalout
ret void
}
; Direct-address ("avar") case for <16 x half>: the vector is loaded straight
; from @globalin and stored straight to @globalout, with no offset.
; The assertions expect a single ld.global.v8.b32 / st.global.v8.b32 pair
; using eight 32-bit registers and the bare symbols as addresses.
define void @avar_half() {
; PTX-LABEL: avar_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x half>, ptr addrspace(1) @globalin
store <16 x half> %load, ptr addrspace(1) @globalout
ret void
}
; Direct-address ("avar") case for <16 x bfloat>: the vector is loaded straight
; from @globalin and stored straight to @globalout, with no offset.
; The assertions expect a single ld.global.v8.b32 / st.global.v8.b32 pair
; using eight 32-bit registers and the bare symbols as addresses.
define void @avar_bfloat() {
; PTX-LABEL: avar_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x bfloat>, ptr addrspace(1) @globalin
store <16 x bfloat> %load, ptr addrspace(1) @globalout
ret void
}
; Direct-address ("avar") case for <8 x i32>: the vector is loaded straight
; from @globalin and stored straight to @globalout, with no offset.
; The assertions expect a single ld.global.v8.b32 / st.global.v8.b32 pair,
; one 32-bit register per element, with the bare symbols as addresses.
define void @avar_i32() {
; PTX-LABEL: avar_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <8 x i32>, ptr addrspace(1) @globalin
store <8 x i32> %load, ptr addrspace(1) @globalout
ret void
}
; Direct-address ("avar") case for <4 x i64>: the vector is loaded straight
; from @globalin and stored straight to @globalout, with no offset.
; The assertions expect a single ld.global.v4.b64 / st.global.v4.b64 pair,
; one 64-bit register per element, with the bare symbols as addresses.
define void @avar_i64() {
; PTX-LABEL: avar_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%load = load <4 x i64>, ptr addrspace(1) @globalin
store <4 x i64> %load, ptr addrspace(1) @globalout
ret void
}
; Direct-address ("avar") case for <8 x float>: the vector is loaded straight
; from @globalin and stored straight to @globalout, with no offset.
; The assertions expect the eight floats to travel in four 64-bit registers
; (ld.global.v4.b64 / st.global.v4.b64) rather than eight 32-bit ones.
; NOTE(review): presumably because float pairs are held in 64-bit registers
; on this target - confirm against the backend.
define void @avar_float() {
; PTX-LABEL: avar_float(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%load = load <8 x float>, ptr addrspace(1) @globalin
store <8 x float> %load, ptr addrspace(1) @globalout
ret void
}
; Direct-address ("avar") case for <4 x double>: the vector is loaded straight
; from @globalin and stored straight to @globalout, with no offset.
; The assertions expect a single ld.global.v4.b64 / st.global.v4.b64 pair,
; one 64-bit register per element, with the bare symbols as addresses.
define void @avar_double() {
; PTX-LABEL: avar_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%load = load <4 x double>, ptr addrspace(1) @globalin
store <4 x double> %load, ptr addrspace(1) @globalout
ret void
}
; Direct address plus constant offset ("asi") case for <32 x i8>: both
; @globalin and @globalout are advanced by 32 bytes with an i8 getelementptr
; before the load and the store.
; The assertions expect the offset to be folded into the address operands
; ([globalin+32] / [globalout+32]) of one v8.b32 load/store pair.
define void @asi_i8() {
; PTX-LABEL: asi_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <32 x i8>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <32 x i8> %load, ptr addrspace(1) %out.offset
ret void
}
; Direct address plus constant offset ("asi") case for <16 x i16>: both
; @globalin and @globalout are advanced by 32 bytes with an i8 getelementptr
; before the load and the store.
; The assertions expect the offset to be folded into the address operands
; ([globalin+32] / [globalout+32]) of one v8.b32 load/store pair.
define void @asi_i16() {
; PTX-LABEL: asi_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <16 x i16>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <16 x i16> %load, ptr addrspace(1) %out.offset
ret void
}
; Direct address plus constant offset ("asi") case for <16 x half>: both
; @globalin and @globalout are advanced by 32 bytes with an i8 getelementptr
; before the load and the store.
; The assertions expect the offset to be folded into the address operands
; ([globalin+32] / [globalout+32]) of one v8.b32 load/store pair.
define void @asi_half() {
; PTX-LABEL: asi_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <16 x half>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <16 x half> %load, ptr addrspace(1) %out.offset
ret void
}
; Direct address plus constant offset ("asi") case for <16 x bfloat>: both
; @globalin and @globalout are advanced by 32 bytes with an i8 getelementptr
; before the load and the store.
; The assertions expect the offset to be folded into the address operands
; ([globalin+32] / [globalout+32]) of one v8.b32 load/store pair.
define void @asi_bfloat() {
; PTX-LABEL: asi_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <16 x bfloat>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <16 x bfloat> %load, ptr addrspace(1) %out.offset
ret void
}
; Direct address plus constant offset ("asi") case for <8 x i32>: both
; @globalin and @globalout are advanced by 32 bytes with an i8 getelementptr
; before the load and the store.
; The assertions expect the offset to be folded into the address operands
; ([globalin+32] / [globalout+32]) of one v8.b32 load/store pair.
define void @asi_i32() {
; PTX-LABEL: asi_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <8 x i32>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <8 x i32> %load, ptr addrspace(1) %out.offset
ret void
}
; Direct address plus constant offset ("asi") case for <4 x i64>: both
; @globalin and @globalout are advanced by 32 bytes with an i8 getelementptr
; before the load and the store.
; The assertions expect the offset to be folded into the address operands
; ([globalin+32] / [globalout+32]) of one v4.b64 load/store pair.
define void @asi_i64() {
; PTX-LABEL: asi_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <4 x i64>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <4 x i64> %load, ptr addrspace(1) %out.offset
ret void
}
; Direct address plus constant offset ("asi") case for <8 x float>: both
; @globalin and @globalout are advanced by 32 bytes with an i8 getelementptr
; before the load and the store.
; The assertions expect the offset to be folded into the address operands
; ([globalin+32] / [globalout+32]) and the eight floats to be carried in four
; 64-bit registers (v4.b64) rather than eight 32-bit ones.
define void @asi_float() {
; PTX-LABEL: asi_float(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <8 x float>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <8 x float> %load, ptr addrspace(1) %out.offset
ret void
}
; Direct address plus constant offset ("asi") case for <4 x double>: both
; @globalin and @globalout are advanced by 32 bytes with an i8 getelementptr
; before the load and the store.
; The assertions expect the offset to be folded into the address operands
; ([globalin+32] / [globalout+32]) of one v4.b64 load/store pair.
define void @asi_double() {
; PTX-LABEL: asi_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <4 x double>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <4 x double> %load, ptr addrspace(1) %out.offset
ret void
}
; 64-bit register address ("areg_64") case for <32 x i8>: the pointer
; arguments %in and %out are used unmodified as the load and store addresses.
; The assertions expect each pointer to be read from its parameter slot into
; a 64-bit register and used as a bare register address ([%rd1] / [%rd2]) by
; one v8.b32 load/store pair.
define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i8_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i8_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <32 x i8>, ptr addrspace(1) %in
store <32 x i8> %load, ptr addrspace(1) %out
ret void
}
; 64-bit register address ("areg_64") case for <16 x i16>: the pointer
; arguments %in and %out are used unmodified as the load and store addresses.
; The assertions expect each pointer to be read from its parameter slot into
; a 64-bit register and used as a bare register address ([%rd1] / [%rd2]) by
; one v8.b32 load/store pair.
define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i16_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i16_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x i16>, ptr addrspace(1) %in
store <16 x i16> %load, ptr addrspace(1) %out
ret void
}
; 64-bit register address ("areg_64") case for <16 x half>: the pointer
; arguments %in and %out are used unmodified as the load and store addresses.
; The assertions expect each pointer to be read from its parameter slot into
; a 64-bit register and used as a bare register address ([%rd1] / [%rd2]) by
; one v8.b32 load/store pair.
define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_half_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_half_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x half>, ptr addrspace(1) %in
store <16 x half> %load, ptr addrspace(1) %out
ret void
}
; 64-bit register address ("areg_64") case for <16 x bfloat>: the pointer
; arguments %in and %out are used unmodified as the load and store addresses.
; The assertions expect each pointer to be read from its parameter slot into
; a 64-bit register and used as a bare register address ([%rd1] / [%rd2]) by
; one v8.b32 load/store pair.
define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_bfloat_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_bfloat_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x bfloat>, ptr addrspace(1) %in
store <16 x bfloat> %load, ptr addrspace(1) %out
ret void
}
; 64-bit register address ("areg_64") case for <8 x i32>: the pointer
; arguments %in and %out are used unmodified as the load and store addresses.
; The assertions expect each pointer to be read from its parameter slot into
; a 64-bit register and used as a bare register address ([%rd1] / [%rd2]) by
; one v8.b32 load/store pair.
define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <8 x i32>, ptr addrspace(1) %in
store <8 x i32> %load, ptr addrspace(1) %out
ret void
}
; 64-bit register address ("areg_64") case for <4 x i64>: the pointer
; arguments %in and %out are used unmodified as the load and store addresses.
; Addresses and data are all 64-bit here, so the assertions show a single
; %rd register set: %rd1 and %rd6 hold the two addresses and %rd2-%rd5 the
; four elements moved by one v4.b64 load/store pair.
define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i64_param_0];
; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i64_param_1];
; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT: ret;
%load = load <4 x i64>, ptr addrspace(1) %in
store <4 x i64> %load, ptr addrspace(1) %out
ret void
}
; 64-bit register address ("areg_64") case for <8 x float>: the pointer
; arguments %in and %out are used unmodified as the load and store addresses.
; The assertions expect the eight floats to be carried in four 64-bit
; registers (%rd2-%rd5, v4.b64), sharing the %rd register set with the two
; address registers %rd1 and %rd6.
define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_float(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0];
; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd6, [areg_64_float_param_1];
; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT: ret;
%load = load <8 x float>, ptr addrspace(1) %in
store <8 x float> %load, ptr addrspace(1) %out
ret void
}
; 64-bit register address ("areg_64") case for <4 x double>: the pointer
; arguments %in and %out are used unmodified as the load and store addresses.
; Addresses and data are all 64-bit here, so the assertions show a single
; %rd register set: %rd1 and %rd6 hold the two addresses and %rd2-%rd5 the
; four elements moved by one v4.b64 load/store pair.
define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_double_param_0];
; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd6, [areg_64_double_param_1];
; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT: ret;
%load = load <4 x double>, ptr addrspace(1) %in
store <4 x double> %load, ptr addrspace(1) %out
ret void
}
; 64-bit register plus constant offset ("ari_64") case for <32 x i8>: the
; pointer arguments %in and %out are each advanced by 32 bytes with an i8
; getelementptr before the load and the store.
; The assertions expect both pointers to be read from their parameter slots
; first, and the offset to be folded into the address operands
; ([%rd1+32] / [%rd2+32]) of one v8.b32 load/store pair.
define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i8_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i8_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <32 x i8>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <32 x i8> %load, ptr addrspace(1) %out.offset
ret void
}
; 64-bit register plus constant offset ("ari_64") case for <16 x i16>: the
; pointer arguments %in and %out are each advanced by 32 bytes with an i8
; getelementptr before the load and the store.
; The assertions expect both pointers to be read from their parameter slots
; first, and the offset to be folded into the address operands
; ([%rd1+32] / [%rd2+32]) of one v8.b32 load/store pair.
define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i16_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i16_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <16 x i16>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <16 x i16> %load, ptr addrspace(1) %out.offset
ret void
}
; 64-bit register plus constant offset ("ari_64") case for <16 x half>: the
; pointer arguments %in and %out are each advanced by 32 bytes with an i8
; getelementptr before the load and the store.
; The assertions expect both pointers to be read from their parameter slots
; first, and the offset to be folded into the address operands
; ([%rd1+32] / [%rd2+32]) of one v8.b32 load/store pair.
define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_half_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_half_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <16 x half>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <16 x half> %load, ptr addrspace(1) %out.offset
ret void
}
; 64-bit register plus constant offset ("ari_64") case for <16 x bfloat>: the
; pointer arguments %in and %out are each advanced by 32 bytes with an i8
; getelementptr before the load and the store.
; The assertions expect both pointers to be read from their parameter slots
; first, and the offset to be folded into the address operands
; ([%rd1+32] / [%rd2+32]) of one v8.b32 load/store pair.
define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_bfloat_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_bfloat_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <16 x bfloat>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <16 x bfloat> %load, ptr addrspace(1) %out.offset
ret void
}
; 64-bit register plus constant offset ("ari_64") case for <8 x i32>: the
; pointer arguments %in and %out are each advanced by 32 bytes with an i8
; getelementptr before the load and the store.
; The assertions expect both pointers to be read from their parameter slots
; first, and the offset to be folded into the address operands
; ([%rd1+32] / [%rd2+32]) of one v8.b32 load/store pair.
define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <8 x i32>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <8 x i32> %load, ptr addrspace(1) %out.offset
ret void
}
; 64-bit register plus constant offset ("ari_64") case for <4 x i64>: the
; pointer arguments %in and %out are each advanced by 32 bytes with an i8
; getelementptr before the load and the store.
; The assertions expect both pointers to be read into %rd1 and %rd2 first,
; the offset to be folded into the address operands ([%rd1+32] / [%rd2+32]),
; and the four elements to be moved in %rd3-%rd6 by one v4.b64 load/store pair.
define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i64_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i64_param_1];
; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <4 x i64>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <4 x i64> %load, ptr addrspace(1) %out.offset
ret void
}
; 64-bit register plus constant offset ("ari_64") case for <8 x float>: the
; pointer arguments %in and %out are each advanced by 32 bytes with an i8
; getelementptr before the load and the store.
; The assertions expect both pointers to be read into %rd1 and %rd2 first,
; the offset to be folded into the address operands ([%rd1+32] / [%rd2+32]),
; and the eight floats to be carried in four 64-bit registers (%rd3-%rd6,
; v4.b64) rather than eight 32-bit ones.
define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_float(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1];
; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <8 x float>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <8 x float> %load, ptr addrspace(1) %out.offset
ret void
}
; 64-bit register plus constant offset ("ari_64") case for <4 x double>: the
; pointer arguments %in and %out are each advanced by 32 bytes with an i8
; getelementptr before the load and the store.
; The assertions expect both pointers to be read into %rd1 and %rd2 first,
; the offset to be folded into the address operands ([%rd1+32] / [%rd2+32]),
; and the four elements to be moved in %rd3-%rd6 by one v4.b64 load/store pair.
define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_double_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_double_param_1];
; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <4 x double>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <4 x double> %load, ptr addrspace(1) %out.offset
ret void
}