
Replace uses of BFE with PRMT when lowering v4i8 vectors. This generally leads to equivalent or better SASS and reduces the number of target-specific operations we need to represent (https://cuda.godbolt.org/z/M75W6f8xd). Also implement KnownBits tracking for PRMT, allowing the elimination of redundant AND instructions when lowering various i8 operations.
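
As a minimal sketch of the new lowering (a hypothetical reduced example, not part of the test below; the exact PTX depends on the target and register allocation), extracting an element of a v4i8 now selects the byte with a single PRMT instead of a bit-field extract. The 0x777N selectors seen in the checks route source byte N into lane 0 and fill lanes 1-3 from byte 7 of the zero second operand, which is how KnownBits can prove the upper 24 bits of the result are zero:

```llvm
; Hypothetical reduced input (function name is illustrative only):
define i8 @extract_elt2(<4 x i8> %v) {
  %e = extractelement <4 x i8> %v, i32 2
  ret i8 %e
}
; Sketch of the expected lowering after this change: one byte-select,
;   prmt.b32 %r2, %r1, 0, 0x7772U;
; selector 0x7772 places source byte 2 in lane 0 and byte 7 of the zero
; second operand in lanes 1-3, so bits 8-31 of the result are known zero
; and a trailing "and.b32 %r2, %r2, 255" becomes redundant.
```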
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

; Check that invariant loads from the global addrspace are lowered to
; ld.global.nc.

define i32 @ld_global(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_param_0];
; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
  %a = load i32, ptr addrspace(1) %ptr, !invariant.load !0
  ret i32 %a
}

define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
; A load of v2f16 is a bit unusual: we consider it a legal type, which happens
; to be loaded/stored as a 32-bit scalar.
; CHECK-LABEL: ld_global_v2f16(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<4>;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v2f16_param_0];
; CHECK-NEXT: ld.global.nc.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT: cvt.f32.f16 %r1, %rs2;
; CHECK-NEXT: cvt.f32.f16 %r2, %rs1;
; CHECK-NEXT: add.rn.f32 %r3, %r2, %r1;
; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
; CHECK-NEXT: ret;
  %a = load <2 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <2 x half> %a, i32 0
  %v2 = extractelement <2 x half> %a, i32 1
  %sum = fadd half %v1, %v2
  ret half %sum
}

define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
; Larger f16 vectors may be split into individual f16 elements, and multiple
; loads/stores may be vectorized using the f16 element type. In practice this
; is limited to the v4 variant only.
; CHECK-LABEL: ld_global_v4f16(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<8>;
; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v4f16_param_0];
; CHECK-NEXT: ld.global.nc.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT: cvt.f32.f16 %r1, %rs2;
; CHECK-NEXT: cvt.f32.f16 %r2, %rs1;
; CHECK-NEXT: add.rn.f32 %r3, %r2, %r1;
; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r3;
; CHECK-NEXT: cvt.f32.f16 %r4, %rs4;
; CHECK-NEXT: cvt.f32.f16 %r5, %rs3;
; CHECK-NEXT: add.rn.f32 %r6, %r5, %r4;
; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r6;
; CHECK-NEXT: cvt.f32.f16 %r7, %rs6;
; CHECK-NEXT: cvt.f32.f16 %r8, %rs5;
; CHECK-NEXT: add.rn.f32 %r9, %r8, %r7;
; CHECK-NEXT: cvt.rn.f16.f32 %rs7, %r9;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
; CHECK-NEXT: ret;
  %a = load <4 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x half> %a, i32 0
  %v2 = extractelement <4 x half> %a, i32 1
  %v3 = extractelement <4 x half> %a, i32 2
  %v4 = extractelement <4 x half> %a, i32 3
  %sum1 = fadd half %v1, %v2
  %sum2 = fadd half %v3, %v4
  %sum = fadd half %sum1, %sum2
  ret half %sum
}

define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
; Larger vectors are, again, loaded as v4i32. PTX has no v8 variants of
; loads/stores, so the load/store vectorizer has to convert v8f16 -> 4 x v2f16.
; CHECK-LABEL: ld_global_v8f16(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<8>;
; CHECK-NEXT: .reg .b32 %r<14>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v8f16_param_0];
; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r4; }
; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r1; }
; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs4, tmp}, %r2; }
; CHECK-NEXT: cvt.f32.f16 %r5, %rs4;
; CHECK-NEXT: cvt.f32.f16 %r6, %rs3;
; CHECK-NEXT: add.rn.f32 %r7, %r6, %r5;
; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r7;
; CHECK-NEXT: cvt.f32.f16 %r8, %rs2;
; CHECK-NEXT: cvt.f32.f16 %r9, %rs1;
; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8;
; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r10;
; CHECK-NEXT: cvt.f32.f16 %r11, %rs6;
; CHECK-NEXT: cvt.f32.f16 %r12, %rs5;
; CHECK-NEXT: add.rn.f32 %r13, %r12, %r11;
; CHECK-NEXT: cvt.rn.f16.f32 %rs7, %r13;
; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
; CHECK-NEXT: ret;
  %a = load <8 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x half> %a, i32 0
  %v2 = extractelement <8 x half> %a, i32 2
  %v3 = extractelement <8 x half> %a, i32 4
  %v4 = extractelement <8 x half> %a, i32 6
  %sum1 = fadd half %v1, %v2
  %sum2 = fadd half %v3, %v4
  %sum = fadd half %sum1, %sum2
  ret half %sum
}

define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v8i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<8>;
; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v8i8_param_0];
; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
; CHECK-NEXT: add.s16 %rs5, %rs4, %rs3;
; CHECK-NEXT: add.s16 %rs6, %rs2, %rs1;
; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6;
; CHECK-NEXT: cvt.u32.u16 %r7, %rs7;
; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NEXT: ret;
  %a = load <8 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x i8> %a, i32 0
  %v2 = extractelement <8 x i8> %a, i32 2
  %v3 = extractelement <8 x i8> %a, i32 4
  %v4 = extractelement <8 x i8> %a, i32 6
  %sum1 = add i8 %v1, %v2
  %sum2 = add i8 %v3, %v4
  %sum = add i8 %sum1, %sum2
  ret i8 %sum
}

define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v16i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<16>;
; CHECK-NEXT: .reg .b32 %r<14>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v16i8_param_0];
; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r6;
; CHECK-NEXT: prmt.b32 %r7, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT: prmt.b32 %r8, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r8;
; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r9;
; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs6, %r10;
; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r11;
; CHECK-NEXT: prmt.b32 %r12, %r1, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs8, %r12;
; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT: add.s16 %rs10, %rs6, %rs5;
; CHECK-NEXT: add.s16 %rs11, %rs4, %rs3;
; CHECK-NEXT: add.s16 %rs12, %rs2, %rs1;
; CHECK-NEXT: add.s16 %rs13, %rs9, %rs10;
; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12;
; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14;
; CHECK-NEXT: cvt.u32.u16 %r13, %rs15;
; CHECK-NEXT: st.param.b32 [func_retval0], %r13;
; CHECK-NEXT: ret;
  %a = load <16 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <16 x i8> %a, i32 0
  %v2 = extractelement <16 x i8> %a, i32 2
  %v3 = extractelement <16 x i8> %a, i32 4
  %v4 = extractelement <16 x i8> %a, i32 6
  %v5 = extractelement <16 x i8> %a, i32 8
  %v6 = extractelement <16 x i8> %a, i32 10
  %v7 = extractelement <16 x i8> %a, i32 12
  %v8 = extractelement <16 x i8> %a, i32 14
  %sum1 = add i8 %v1, %v2
  %sum2 = add i8 %v3, %v4
  %sum3 = add i8 %v5, %v6
  %sum4 = add i8 %v7, %v8
  %sum5 = add i8 %sum1, %sum2
  %sum6 = add i8 %sum3, %sum4
  %sum7 = add i8 %sum5, %sum6
  ret i8 %sum7
}

define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v2i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v2i32_param_0];
; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT: add.s32 %r3, %r1, %r2;
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
  %a = load <2 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <2 x i32> %a, i32 0
  %v2 = extractelement <2 x i32> %a, i32 1
  %sum = add i32 %v1, %v2
  ret i32 %sum
}

define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v4i32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v4i32_param_0];
; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: add.s32 %r5, %r1, %r2;
; CHECK-NEXT: add.s32 %r6, %r3, %r4;
; CHECK-NEXT: add.s32 %r7, %r5, %r6;
; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NEXT: ret;
  %a = load <4 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x i32> %a, i32 0
  %v2 = extractelement <4 x i32> %a, i32 1
  %v3 = extractelement <4 x i32> %a, i32 2
  %v4 = extractelement <4 x i32> %a, i32 3
  %sum1 = add i32 %v1, %v2
  %sum2 = add i32 %v3, %v4
  %sum3 = add i32 %sum1, %sum2
  ret i32 %sum3
}

define i32 @ld_not_invariant(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_not_invariant(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_not_invariant_param_0];
; CHECK-NEXT: ld.global.b32 %r1, [%rd1];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
  %a = load i32, ptr addrspace(1) %ptr
  ret i32 %a
}

define i32 @ld_not_global_addrspace(ptr addrspace(0) %ptr) {
; CHECK-LABEL: ld_not_global_addrspace(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [ld_not_global_addrspace_param_0];
; CHECK-NEXT: ld.b32 %r1, [%rd1];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
  %a = load i32, ptr addrspace(0) %ptr
  ret i32 %a
}

!0 = !{}