llvm-project/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
Alex MacLean d494eb0fa3
[NVPTX] Skip numbering unreferenced virtual registers (readability) (#154391)
When assigning numbers to registers, skip any with neither uses nor
defs. This will not have any impact at all on the final SASS but it
makes for slightly more readable PTX. This change should also ensure
that future minor changes are less likely to cause noisy diffs in
register numbering.
2025-08-19 12:27:46 -07:00

726 lines
24 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck -check-prefix=SM20 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck -check-prefix=SM35 %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"
; Scalar float case: loads a float through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output uses a plain
; ld.global.b32 for sm_20 and the non-coherent ld.global.nc.b32 for sm_35.
define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo1(
; SM20: {
; SM20-NEXT: .reg .b32 %r<2>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo1_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo1_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.b32 %r1, [%rd2];
; SM20-NEXT: st.global.b32 [%rd4], %r1;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo1(
; SM35: {
; SM35-NEXT: .reg .b32 %r<2>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo1_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo1_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.b32 %r1, [%rd2];
; SM35-NEXT: st.global.b32 [%rd4], %r1;
; SM35-NEXT: ret;
%1 = load float, ptr %from
store float %1, ptr %to
ret void
}
; Scalar double case: loads a double through the noalias readonly kernel
; pointer argument %from and stores it to %to. The expected output uses
; ld.global.b64 for sm_20 and ld.global.nc.b64 for sm_35.
define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo2(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<6>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo2_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo2_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.b64 %rd5, [%rd2];
; SM20-NEXT: st.global.b64 [%rd4], %rd5;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo2(
; SM35: {
; SM35-NEXT: .reg .b64 %rd<6>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo2_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo2_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2];
; SM35-NEXT: st.global.b64 [%rd4], %rd5;
; SM35-NEXT: ret;
%1 = load double, ptr %from
store double %1, ptr %to
ret void
}
; Scalar i16 case: loads an i16 through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output uses
; ld.global.b16 for sm_20 and ld.global.nc.b16 for sm_35.
define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo3(
; SM20: {
; SM20-NEXT: .reg .b16 %rs<2>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo3_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo3_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.b16 %rs1, [%rd2];
; SM20-NEXT: st.global.b16 [%rd4], %rs1;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo3(
; SM35: {
; SM35-NEXT: .reg .b16 %rs<2>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo3_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo3_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.b16 %rs1, [%rd2];
; SM35-NEXT: st.global.b16 [%rd4], %rs1;
; SM35-NEXT: ret;
%1 = load i16, ptr %from
store i16 %1, ptr %to
ret void
}
; Scalar i32 case: loads an i32 through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output uses
; ld.global.b32 for sm_20 and ld.global.nc.b32 for sm_35.
define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo4(
; SM20: {
; SM20-NEXT: .reg .b32 %r<2>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo4_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo4_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.b32 %r1, [%rd2];
; SM20-NEXT: st.global.b32 [%rd4], %r1;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo4(
; SM35: {
; SM35-NEXT: .reg .b32 %r<2>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo4_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo4_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.b32 %r1, [%rd2];
; SM35-NEXT: st.global.b32 [%rd4], %r1;
; SM35-NEXT: ret;
%1 = load i32, ptr %from
store i32 %1, ptr %to
ret void
}
; Scalar i64 case: loads an i64 through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output uses
; ld.global.b64 for sm_20 and ld.global.nc.b64 for sm_35.
define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo5(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<6>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo5_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo5_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.b64 %rd5, [%rd2];
; SM20-NEXT: st.global.b64 [%rd4], %rd5;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo5(
; SM35: {
; SM35-NEXT: .reg .b64 %rd<6>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo5_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo5_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2];
; SM35-NEXT: st.global.b64 [%rd4], %rd5;
; SM35-NEXT: ret;
%1 = load i64, ptr %from
store i64 %1, ptr %to
ret void
}
; i128 is a non-standard integer type in nvptx64
; Scalar i128 case: loads an i128 through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output moves the value as
; a pair of 64-bit registers: ld.global.v2.b64 for sm_20 and
; ld.global.nc.v2.b64 for sm_35.
define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo6(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<7>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo6_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo6_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo6(
; SM35: {
; SM35-NEXT: .reg .b64 %rd<7>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo6_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo6_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT: ret;
%1 = load i128, ptr %from
store i128 %1, ptr %to
ret void
}
; <2 x i8> case: loads the vector through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output uses
; ld.global.v2.b8 for sm_20 and ld.global.nc.v2.b8 for sm_35.
define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo7(
; SM20: {
; SM20-NEXT: .reg .b16 %rs<3>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo7_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo7_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.v2.b8 {%rs1, %rs2}, [%rd2];
; SM20-NEXT: st.global.v2.b8 [%rd4], {%rs1, %rs2};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo7(
; SM35: {
; SM35-NEXT: .reg .b16 %rs<3>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo7_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo7_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
; SM35-NEXT: st.global.v2.b8 [%rd4], {%rs1, %rs2};
; SM35-NEXT: ret;
%1 = load <2 x i8>, ptr %from
store <2 x i8> %1, ptr %to
ret void
}
; <2 x i16> case: loads the vector through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output moves the whole
; vector as one 32-bit value: ld.global.b32 for sm_20 and ld.global.nc.b32
; for sm_35.
define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo8(
; SM20: {
; SM20-NEXT: .reg .b32 %r<2>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo8_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo8_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.b32 %r1, [%rd2];
; SM20-NEXT: st.global.b32 [%rd4], %r1;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo8(
; SM35: {
; SM35-NEXT: .reg .b32 %r<2>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo8_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo8_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.b32 %r1, [%rd2];
; SM35-NEXT: st.global.b32 [%rd4], %r1;
; SM35-NEXT: ret;
%1 = load <2 x i16>, ptr %from
store <2 x i16> %1, ptr %to
ret void
}
; <2 x i32> case: loads the vector through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output uses
; ld.global.v2.b32 for sm_20 and ld.global.nc.v2.b32 for sm_35.
define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo9(
; SM20: {
; SM20-NEXT: .reg .b32 %r<3>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo9_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo9_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2];
; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo9(
; SM35: {
; SM35-NEXT: .reg .b32 %r<3>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo9_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo9_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2];
; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
; SM35-NEXT: ret;
%1 = load <2 x i32>, ptr %from
store <2 x i32> %1, ptr %to
ret void
}
; <2 x i64> case: loads the vector through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output uses
; ld.global.v2.b64 for sm_20 and ld.global.nc.v2.b64 for sm_35.
define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo10(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<7>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo10_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo10_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo10(
; SM35: {
; SM35-NEXT: .reg .b64 %rd<7>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo10_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo10_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT: ret;
%1 = load <2 x i64>, ptr %from
store <2 x i64> %1, ptr %to
ret void
}
; <2 x float> case: loads the vector through the noalias readonly kernel
; pointer argument %from and stores it to %to. The expected output moves the
; whole vector as one 64-bit value: ld.global.b64 for sm_20 and
; ld.global.nc.b64 for sm_35.
define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo11(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<6>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo11_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo11_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.b64 %rd5, [%rd2];
; SM20-NEXT: st.global.b64 [%rd4], %rd5;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo11(
; SM35: {
; SM35-NEXT: .reg .b64 %rd<6>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo11_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo11_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2];
; SM35-NEXT: st.global.b64 [%rd4], %rd5;
; SM35-NEXT: ret;
%1 = load <2 x float>, ptr %from
store <2 x float> %1, ptr %to
ret void
}
; <2 x double> case: loads the vector through the noalias readonly kernel
; pointer argument %from and stores it to %to. The expected output uses
; ld.global.v2.b64 for sm_20 and ld.global.nc.v2.b64 for sm_35.
define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo12(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<7>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo12_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo12_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo12(
; SM35: {
; SM35-NEXT: .reg .b64 %rd<7>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo12_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo12_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT: ret;
%1 = load <2 x double>, ptr %from
store <2 x double> %1, ptr %to
ret void
}
; <4 x i8> case: loads the vector through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output moves the whole
; vector as one 32-bit value: ld.global.b32 for sm_20 and ld.global.nc.b32
; for sm_35.
define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo13(
; SM20: {
; SM20-NEXT: .reg .b32 %r<2>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo13_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo13_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.b32 %r1, [%rd2];
; SM20-NEXT: st.global.b32 [%rd4], %r1;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo13(
; SM35: {
; SM35-NEXT: .reg .b32 %r<2>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo13_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo13_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.b32 %r1, [%rd2];
; SM35-NEXT: st.global.b32 [%rd4], %r1;
; SM35-NEXT: ret;
%1 = load <4 x i8>, ptr %from
store <4 x i8> %1, ptr %to
ret void
}
; <4 x i16> case: loads the vector through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output moves the vector
; as two 32-bit registers: ld.global.v2.b32 for sm_20 and
; ld.global.nc.v2.b32 for sm_35.
define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo14(
; SM20: {
; SM20-NEXT: .reg .b32 %r<3>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo14_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo14_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2];
; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo14(
; SM35: {
; SM35-NEXT: .reg .b32 %r<3>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo14_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo14_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2];
; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2};
; SM35-NEXT: ret;
%1 = load <4 x i16>, ptr %from
store <4 x i16> %1, ptr %to
ret void
}
; <4 x i32> case: loads the vector through the noalias readonly kernel pointer
; argument %from and stores it to %to. The expected output uses
; ld.global.v4.b32 for sm_20 and ld.global.nc.v4.b32 for sm_35.
define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo15(
; SM20: {
; SM20-NEXT: .reg .b32 %r<5>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo15_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo15_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
; SM20-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo15(
; SM35: {
; SM35-NEXT: .reg .b32 %r<5>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo15_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo15_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
; SM35-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
; SM35-NEXT: ret;
%1 = load <4 x i32>, ptr %from
store <4 x i32> %1, ptr %to
ret void
}
; <4 x float> case: loads the vector through the noalias readonly kernel
; pointer argument %from and stores it to %to. The expected output moves the
; vector as two 64-bit registers: ld.global.v2.b64 for sm_20 and
; ld.global.nc.v2.b64 for sm_35.
define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo16(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<7>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo16_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo16_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo16(
; SM35: {
; SM35-NEXT: .reg .b64 %rd<7>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo16_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo16_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT: ret;
%1 = load <4 x float>, ptr %from
store <4 x float> %1, ptr %to
ret void
}
; <4 x double> case: loads the vector through the noalias readonly kernel
; pointer argument %from and stores it to %to. The expected output splits the
; access into two v2.b64 loads (at offsets 0 and 16) and two v2.b64 stores;
; both loads are plain ld.global on sm_20 and ld.global.nc on sm_35.
define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo17(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<9>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo17_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo17_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT: ld.global.v2.b64 {%rd7, %rd8}, [%rd2+16];
; SM20-NEXT: st.global.v2.b64 [%rd4+16], {%rd7, %rd8};
; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT: ret;
;
; SM35-LABEL: foo17(
; SM35: {
; SM35-NEXT: .reg .b64 %rd<9>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo17_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo17_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT: ld.global.nc.v2.b64 {%rd7, %rd8}, [%rd2+16];
; SM35-NEXT: st.global.v2.b64 [%rd4+16], {%rd7, %rd8};
; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT: ret;
%1 = load <4 x double>, ptr %from
store <4 x double> %1, ptr %to
ret void
}
; Pointer-typed value case: loads a ptr through the noalias readonly kernel
; pointer argument %from and stores it to %to. The expected output uses
; ld.global.b64 for sm_20 and ld.global.nc.b64 for sm_35.
define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo18(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<6>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [foo18_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT: ld.param.b64 %rd3, [foo18_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT: ld.global.b64 %rd5, [%rd2];
; SM20-NEXT: st.global.b64 [%rd4], %rd5;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo18(
; SM35: {
; SM35-NEXT: .reg .b64 %rd<6>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [foo18_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT: ld.param.b64 %rd3, [foo18_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2];
; SM35-NEXT: st.global.b64 [%rd4], %rd5;
; SM35-NEXT: ret;
%1 = load ptr, ptr %from
store ptr %1, ptr %to
ret void
}
; Test that we can infer a cached load for a pointer induction variable.
; The IR indexes %from with an i32 counter and accumulates a float sum until
; the incremented counter equals %n, then stores the sum to %to. In the
; expected output the address is instead advanced by 4 bytes per iteration
; (add.s64), and the in-loop load is ld.global.b32 for sm_20 and
; ld.global.nc.b32 for sm_35.
define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
; SM20-LABEL: foo19(
; SM20: {
; SM20-NEXT: .reg .pred %p<2>;
; SM20-NEXT: .reg .b32 %r<4>;
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0: // %entry
; SM20-NEXT: ld.param.b32 %r2, [foo19_param_2];
; SM20-NEXT: ld.param.b64 %rd2, [foo19_param_0];
; SM20-NEXT: cvta.to.global.u64 %rd4, %rd2;
; SM20-NEXT: ld.param.b64 %rd3, [foo19_param_1];
; SM20-NEXT: cvta.to.global.u64 %rd1, %rd3;
; SM20-NEXT: mov.b32 %r3, 0f00000000;
; SM20-NEXT: $L__BB18_1: // %loop
; SM20-NEXT: // =>This Inner Loop Header: Depth=1
; SM20-NEXT: ld.global.b32 %r1, [%rd4];
; SM20-NEXT: add.rn.f32 %r3, %r1, %r3;
; SM20-NEXT: add.s64 %rd4, %rd4, 4;
; SM20-NEXT: add.s32 %r2, %r2, -1;
; SM20-NEXT: setp.ne.b32 %p1, %r2, 0;
; SM20-NEXT: @%p1 bra $L__BB18_1;
; SM20-NEXT: // %bb.2: // %exit
; SM20-NEXT: st.global.b32 [%rd1], %r3;
; SM20-NEXT: ret;
;
; SM35-LABEL: foo19(
; SM35: {
; SM35-NEXT: .reg .pred %p<2>;
; SM35-NEXT: .reg .b32 %r<4>;
; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0: // %entry
; SM35-NEXT: ld.param.b32 %r2, [foo19_param_2];
; SM35-NEXT: ld.param.b64 %rd2, [foo19_param_0];
; SM35-NEXT: cvta.to.global.u64 %rd4, %rd2;
; SM35-NEXT: ld.param.b64 %rd3, [foo19_param_1];
; SM35-NEXT: cvta.to.global.u64 %rd1, %rd3;
; SM35-NEXT: mov.b32 %r3, 0f00000000;
; SM35-NEXT: $L__BB18_1: // %loop
; SM35-NEXT: // =>This Inner Loop Header: Depth=1
; SM35-NEXT: ld.global.nc.b32 %r1, [%rd4];
; SM35-NEXT: add.rn.f32 %r3, %r1, %r3;
; SM35-NEXT: add.s64 %rd4, %rd4, 4;
; SM35-NEXT: add.s32 %r2, %r2, -1;
; SM35-NEXT: setp.ne.b32 %p1, %r2, 0;
; SM35-NEXT: @%p1 bra $L__BB18_1;
; SM35-NEXT: // %bb.2: // %exit
; SM35-NEXT: st.global.b32 [%rd1], %r3;
; SM35-NEXT: ret;
entry:
br label %loop
loop:
%i = phi i32 [ 0, %entry ], [ %nexti, %loop ]
%sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ]
%ptr = getelementptr inbounds float, ptr %from, i32 %i
%value = load float, ptr %ptr, align 4
%nextsum = fadd float %value, %sum
%nexti = add nsw i32 %i, 1
%exitcond = icmp eq i32 %nexti, %n
br i1 %exitcond, label %exit, label %loop
exit:
store float %nextsum, ptr %to
ret void
}
; This test captures the case of a non-kernel function. In a
; non-kernel function, without interprocedural analysis, we do not
; know that the parameter is global. We also do not know that the
; pointed-to memory is never written to (for the duration of the
; kernel). For both reasons, we cannot use a cached load here.
; Same float load/store body as the kernel tests, but without the ptx_kernel
; calling convention. The expected output is identical for sm_20 and sm_35:
; generic ld.b32 / st.b32 with no cvta.to.global and no .nc qualifier.
define void @notkernel(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: notkernel(
; SM20: {
; SM20-NEXT: .reg .b32 %r<2>;
; SM20-NEXT: .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [notkernel_param_0];
; SM20-NEXT: ld.b32 %r1, [%rd1];
; SM20-NEXT: ld.param.b64 %rd2, [notkernel_param_1];
; SM20-NEXT: st.b32 [%rd2], %r1;
; SM20-NEXT: ret;
;
; SM35-LABEL: notkernel(
; SM35: {
; SM35-NEXT: .reg .b32 %r<2>;
; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [notkernel_param_0];
; SM35-NEXT: ld.b32 %r1, [%rd1];
; SM35-NEXT: ld.param.b64 %rd2, [notkernel_param_1];
; SM35-NEXT: st.b32 [%rd2], %r1;
; SM35-NEXT: ret;
%1 = load float, ptr %from
store float %1, ptr %to
ret void
}
; As @notkernel, but with the parameter explicitly marked as global. We still
; do not know that the parameter is never written to (for the duration of the
; kernel). This case does not currently come up normally since we do not infer
; that pointers are global interprocedurally as of 2015-08-05.
; Non-kernel function whose %from parameter is in addrspace(1). The expected
; output is identical for sm_20 and sm_35: the load is ld.global.b32 without
; the .nc qualifier, and the store through the generic %to pointer is st.b32.
define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) {
; SM20-LABEL: notkernel2(
; SM20: {
; SM20-NEXT: .reg .b32 %r<2>;
; SM20-NEXT: .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.b64 %rd1, [notkernel2_param_0];
; SM20-NEXT: ld.global.b32 %r1, [%rd1];
; SM20-NEXT: ld.param.b64 %rd2, [notkernel2_param_1];
; SM20-NEXT: st.b32 [%rd2], %r1;
; SM20-NEXT: ret;
;
; SM35-LABEL: notkernel2(
; SM35: {
; SM35-NEXT: .reg .b32 %r<2>;
; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.b64 %rd1, [notkernel2_param_0];
; SM35-NEXT: ld.global.b32 %r1, [%rd1];
; SM35-NEXT: ld.param.b64 %rd2, [notkernel2_param_1];
; SM35-NEXT: st.b32 [%rd2], %r1;
; SM35-NEXT: ret;
%1 = load float, ptr addrspace(1) %from
store float %1, ptr %to
ret void
}