
Lower `fadd`, `fsub`, `fmul`, and `fma` to f32x2 variants introduced in PTX 8.6 for sm_100+. Adds a new register class for v2f32 as a b64 register in PTX. This causes other vector operations like loads and stores to lower as .b64 instead of .v2.b32 as appropriate. Also update test cases to use the autogenerator.
546 lines
21 KiB
LLVM
546 lines
21 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX
|
|
; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}
|
|
|
|
; In this test, we check that all the addressing modes are lowered correctly
|
|
; for 256-bit invariant loads, which get lowered to ld.global.nc
|
|
; addr can be any of the following:
|
|
; - avar : direct address
|
|
; - asi: direct address + offset
|
|
; - areg_64: 64-bit register
|
|
; - ari_64: 64-bit register + offset
|
|
; Since this is a blackwell+ feature,
|
|
; and support for 32-bit addressing does not exist after sm_90,
|
|
; the "areg" and "ari" 32-bit addressing modes are not tested or supported.
|
|
|
|
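; For invariant loads, asi (direct address + offset) was historically not
; supported and was instead selected as a move into a register, an add of the
; offset, and a load through areg_64. The asi_* checks below now expect the
; offset to be folded directly into the address operand (e.g. [globalin+32]).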
|
|
|
|
; Checks 8 types: i8, i16, bfloat, half, i32, i64, float, double
|
|
|
|
; Global is the only address space that currently supports 256-bit loads/stores
|
|
|
|
; Externally defined addrspace(1) buffers used as the load source and store
; destination by the avar_* and asi_* tests.
@globalin = external addrspace(1) global ptr
@globalout = external addrspace(1) global ptr
|
|
|
|
; avar (direct address), <32 x i8>: invariant 256-bit load from @globalin,
; stored to @globalout. Expected lowering is a single ld.global.nc.v8.b32 and
; a single st.global.v8.b32 addressed by symbol.
define void @avar_i8() {
; PTX-LABEL: avar_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <32 x i8>, ptr addrspace(1) @globalin, !invariant.load !0
  store <32 x i8> %load, ptr addrspace(1) @globalout
  ret void
}
|
|
|
|
; avar (direct address), <16 x i16>: invariant 256-bit load from @globalin,
; stored to @globalout. Expected lowering is a single ld.global.nc.v8.b32 and
; a single st.global.v8.b32 addressed by symbol.
define void @avar_i16() {
; PTX-LABEL: avar_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <16 x i16>, ptr addrspace(1) @globalin, !invariant.load !0
  store <16 x i16> %load, ptr addrspace(1) @globalout
  ret void
}
|
|
|
|
; avar (direct address), <16 x half>: invariant 256-bit load from @globalin,
; stored to @globalout. Expected lowering is a single ld.global.nc.v8.b32 and
; a single st.global.v8.b32 addressed by symbol.
define void @avar_half() {
; PTX-LABEL: avar_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <16 x half>, ptr addrspace(1) @globalin, !invariant.load !0
  store <16 x half> %load, ptr addrspace(1) @globalout
  ret void
}
|
|
|
|
; avar (direct address), <16 x bfloat>: invariant 256-bit load from @globalin,
; stored to @globalout. Expected lowering is a single ld.global.nc.v8.b32 and
; a single st.global.v8.b32 addressed by symbol.
define void @avar_bfloat() {
; PTX-LABEL: avar_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <16 x bfloat>, ptr addrspace(1) @globalin, !invariant.load !0
  store <16 x bfloat> %load, ptr addrspace(1) @globalout
  ret void
}
|
|
|
|
; avar (direct address), <8 x i32>: invariant 256-bit load from @globalin,
; stored to @globalout. Expected lowering is a single ld.global.nc.v8.b32 and
; a single st.global.v8.b32 addressed by symbol.
define void @avar_i32() {
; PTX-LABEL: avar_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <8 x i32>, ptr addrspace(1) @globalin, !invariant.load !0
  store <8 x i32> %load, ptr addrspace(1) @globalout
  ret void
}
|
|
|
|
; avar (direct address), <4 x i64>: invariant 256-bit load from @globalin,
; stored to @globalout. Expected lowering is a single ld.global.nc.v4.b64 and
; a single st.global.v4.b64 addressed by symbol.
define void @avar_i64() {
; PTX-LABEL: avar_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
  %load = load <4 x i64>, ptr addrspace(1) @globalin, !invariant.load !0
  store <4 x i64> %load, ptr addrspace(1) @globalout
  ret void
}
|
|
|
|
; avar (direct address), <8 x float>: invariant 256-bit load from @globalin,
; stored to @globalout. The checks expect the eight floats to travel in four
; 64-bit registers (v4.b64) rather than eight 32-bit ones.
define void @avar_float() {
; PTX-LABEL: avar_float(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
  %load = load <8 x float>, ptr addrspace(1) @globalin, !invariant.load !0
  store <8 x float> %load, ptr addrspace(1) @globalout
  ret void
}
|
|
|
|
; avar (direct address), <4 x double>: invariant 256-bit load from @globalin,
; stored to @globalout. Expected lowering is a single ld.global.nc.v4.b64 and
; a single st.global.v4.b64 addressed by symbol.
define void @avar_double() {
; PTX-LABEL: avar_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
  %load = load <4 x double>, ptr addrspace(1) @globalin, !invariant.load !0
  store <4 x double> %load, ptr addrspace(1) @globalout
  ret void
}
|
|
|
|
; asi (direct address + offset), <32 x i8>: both globals are advanced by 32
; bytes before the invariant load and the store. The checks expect the offset
; folded into the operands as [globalin+32] and [globalout+32].
define void @asi_i8() {
; PTX-LABEL: asi_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <32 x i8>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <32 x i8> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; asi (direct address + offset), <16 x i16>: both globals are advanced by 32
; bytes before the invariant load and the store. The checks expect the offset
; folded into the operands as [globalin+32] and [globalout+32].
define void @asi_i16() {
; PTX-LABEL: asi_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <16 x i16>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <16 x i16> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; asi (direct address + offset), <16 x half>: both globals are advanced by 32
; bytes before the invariant load and the store. The checks expect the offset
; folded into the operands as [globalin+32] and [globalout+32].
define void @asi_half() {
; PTX-LABEL: asi_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <16 x half>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <16 x half> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; asi (direct address + offset), <16 x bfloat>: both globals are advanced by
; 32 bytes before the invariant load and the store. The checks expect the
; offset folded into the operands as [globalin+32] and [globalout+32].
define void @asi_bfloat() {
; PTX-LABEL: asi_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <16 x bfloat>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <16 x bfloat> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; asi (direct address + offset), <8 x i32>: both globals are advanced by 32
; bytes before the invariant load and the store. The checks expect the offset
; folded into the operands as [globalin+32] and [globalout+32].
define void @asi_i32() {
; PTX-LABEL: asi_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <8 x i32> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; asi (direct address + offset), <4 x i64>: both globals are advanced by 32
; bytes before the invariant load and the store. The checks expect v4.b64
; accesses with the offset folded in as [globalin+32] and [globalout+32].
define void @asi_i64() {
; PTX-LABEL: asi_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <4 x i64> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; asi (direct address + offset), <8 x float>: both globals are advanced by 32
; bytes before the invariant load and the store. The checks expect the floats
; in four 64-bit registers (v4.b64) at [globalin+32] and [globalout+32].
define void @asi_float() {
; PTX-LABEL: asi_float(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <8 x float> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; asi (direct address + offset), <4 x double>: both globals are advanced by
; 32 bytes before the invariant load and the store. The checks expect v4.b64
; accesses with the offset folded in as [globalin+32] and [globalout+32].
define void @asi_double() {
; PTX-LABEL: asi_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <4 x double> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; areg_64 (64-bit register address), <32 x i8>: source and destination
; pointers are function parameters. The checks expect each pointer to be read
; with ld.param.b64 and used unmodified as [%rd1] / [%rd2].
define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i8_param_0];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i8_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <32 x i8>, ptr addrspace(1) %in, !invariant.load !0
  store <32 x i8> %load, ptr addrspace(1) %out
  ret void
}
|
|
; areg_64 (64-bit register address), <16 x i16>: source and destination
; pointers are function parameters. The checks expect each pointer to be read
; with ld.param.b64 and used unmodified as [%rd1] / [%rd2].
define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i16_param_0];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i16_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <16 x i16>, ptr addrspace(1) %in, !invariant.load !0
  store <16 x i16> %load, ptr addrspace(1) %out
  ret void
}
|
|
; areg_64 (64-bit register address), <16 x half>: source and destination
; pointers are function parameters. The checks expect each pointer to be read
; with ld.param.b64 and used unmodified as [%rd1] / [%rd2].
define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_half_param_0];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_half_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <16 x half>, ptr addrspace(1) %in, !invariant.load !0
  store <16 x half> %load, ptr addrspace(1) %out
  ret void
}
|
|
; areg_64 (64-bit register address), <16 x bfloat>: source and destination
; pointers are function parameters. The checks expect each pointer to be read
; with ld.param.b64 and used unmodified as [%rd1] / [%rd2].
define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_bfloat_param_0];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_bfloat_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <16 x bfloat>, ptr addrspace(1) %in, !invariant.load !0
  store <16 x bfloat> %load, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; areg_64 (64-bit register address), <8 x i32>: source and destination
; pointers are function parameters. The checks expect each pointer to be read
; with ld.param.b64 and used unmodified as [%rd1] / [%rd2].
define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %load = load <8 x i32>, ptr addrspace(1) %in, !invariant.load !0
  store <8 x i32> %load, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; areg_64 (64-bit register address), <4 x i64>: source and destination
; pointers are function parameters. Addresses and data share the .b64
; register file here, so the checks expect %rd1/%rd6 to hold the pointers and
; %rd2-%rd5 the loaded vector.
define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i64_param_0];
; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i64_param_1];
; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT: ret;
  %load = load <4 x i64>, ptr addrspace(1) %in, !invariant.load !0
  store <4 x i64> %load, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; areg_64 (64-bit register address), <8 x float>: source and destination
; pointers are function parameters. The checks expect the floats in four
; 64-bit registers (%rd2-%rd5), with %rd1/%rd6 holding the pointers.
define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_float(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0];
; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd6, [areg_64_float_param_1];
; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT: ret;
  %load = load <8 x float>, ptr addrspace(1) %in, !invariant.load !0
  store <8 x float> %load, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; areg_64 (64-bit register address), <4 x double>: source and destination
; pointers are function parameters. Addresses and data share the .b64
; register file here, so the checks expect %rd1/%rd6 to hold the pointers and
; %rd2-%rd5 the loaded vector.
define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_double_param_0];
; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd6, [areg_64_double_param_1];
; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT: ret;
  %load = load <4 x double>, ptr addrspace(1) %in, !invariant.load !0
  store <4 x double> %load, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; ari_64 (64-bit register + offset), <32 x i8>: both parameter pointers are
; advanced by 32 bytes before the invariant load and the store. The checks
; expect the offset folded into the operands as [%rd1+32] and [%rd2+32].
define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i8_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i8_param_1];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <32 x i8>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <32 x i8> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; ari_64 (64-bit register + offset), <16 x i16>: both parameter pointers are
; advanced by 32 bytes before the invariant load and the store. The checks
; expect the offset folded into the operands as [%rd1+32] and [%rd2+32].
define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i16_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i16_param_1];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <16 x i16>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <16 x i16> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; ari_64 (64-bit register + offset), <16 x half>: both parameter pointers are
; advanced by 32 bytes before the invariant load and the store. The checks
; expect the offset folded into the operands as [%rd1+32] and [%rd2+32].
define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_half_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_half_param_1];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <16 x half>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <16 x half> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; ari_64 (64-bit register + offset), <16 x bfloat>: both parameter pointers
; are advanced by 32 bytes before the invariant load and the store. The checks
; expect the offset folded into the operands as [%rd1+32] and [%rd2+32].
define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_bfloat_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_bfloat_param_1];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <16 x bfloat>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <16 x bfloat> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; ari_64 (64-bit register + offset), <8 x i32>: both parameter pointers are
; advanced by 32 bytes before the invariant load and the store. The checks
; expect the offset folded into the operands as [%rd1+32] and [%rd2+32].
define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1];
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <8 x i32> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; ari_64 (64-bit register + offset), <4 x i64>: both parameter pointers are
; advanced by 32 bytes before the invariant load and the store. The checks
; expect %rd1/%rd2 as the bases ([%rd1+32], [%rd2+32]) and %rd3-%rd6 as data.
define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i64_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i64_param_1];
; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <4 x i64> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; ari_64 (64-bit register + offset), <8 x float>: both parameter pointers are
; advanced by 32 bytes before the invariant load and the store. The checks
; expect the floats in four 64-bit registers (%rd3-%rd6) and the offset folded
; in as [%rd1+32] and [%rd2+32].
define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_float(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1];
; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <8 x float> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; ari_64 (64-bit register + offset), <4 x double>: both parameter pointers are
; advanced by 32 bytes before the invariant load and the store. The checks
; expect %rd1/%rd2 as the bases ([%rd1+32], [%rd2+32]) and %rd3-%rd6 as data.
define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_double_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_double_param_1];
; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT: ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <4 x double> %load, ptr addrspace(1) %out.offset
  ret void
}
|
|
|
|
; Empty metadata node referenced by every load in this file as its
; !invariant.load operand.
!0 = !{}
|