154 lines
6.7 KiB
LLVM
154 lines
6.7 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
|
|
|
|
;
|
|
; kernel void combine_vloads(global char8* src, global char8* result) {
|
|
; for (int i = 0; i < 1024; ++i)
|
|
; result[i] = src[0] + src[1] + src[2] + src[3];
|
|
; }
|
|
;
|
|
|
|
|
|
; The byte accesses should be combined into 128-bit loads instead of many 8-bit loads.
|
|
define amdgpu_kernel void @combine_vloads(ptr addrspace(1) nocapture %src, ptr addrspace(1) nocapture %result) nounwind {
|
|
; EG-LABEL: combine_vloads:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 3, @16, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LOOP_START_DX10 @10
|
|
; EG-NEXT: TEX 1 @12
|
|
; EG-NEXT: ALU 86, @20, KC0[], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XY, T15.X, 0
|
|
; EG-NEXT: ALU_PUSH_BEFORE 4, @107, KC0[], KC1[]
|
|
; EG-NEXT: JUMP @9 POP:1
|
|
; EG-NEXT: LOOP_BREAK @9
|
|
; EG-NEXT: POP @9 POP:1
|
|
; EG-NEXT: END_LOOP @2
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 12:
|
|
; EG-NEXT: VTX_READ_128 T14.XYZW, T13.X, 0, #1
|
|
; EG-NEXT: VTX_READ_128 T15.XYZW, T13.X, 16, #1
|
|
; EG-NEXT: ALU clause starting at 16:
|
|
; EG-NEXT: MOV T13.X, KC0[2].Y,
|
|
; EG-NEXT: MOV T0.W, KC0[2].Z,
|
|
; EG-NEXT: MOV * T1.W, literal.x,
|
|
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
|
|
; EG-NEXT: ALU clause starting at 20:
|
|
; EG-NEXT: LSHR T2.W, T14.Y, literal.x,
|
|
; EG-NEXT: LSHR * T3.W, T14.W, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ADD_INT T2.W, PV.W, PS,
|
|
; EG-NEXT: LSHR * T3.W, T15.Y, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T0.Y, T14.Y, literal.x,
|
|
; EG-NEXT: LSHR T0.Z, T14.W, literal.x,
|
|
; EG-NEXT: ADD_INT T2.W, PV.W, PS,
|
|
; EG-NEXT: LSHR * T3.W, T15.W, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: ADD_INT T16.X, PV.W, PS,
|
|
; EG-NEXT: ADD_INT T0.Y, PV.Y, PV.Z,
|
|
; EG-NEXT: LSHR T0.Z, T15.Y, literal.x,
|
|
; EG-NEXT: LSHR T2.W, T14.X, literal.y,
|
|
; EG-NEXT: LSHR * T3.W, T14.Z, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: ADD_INT T17.X, PV.W, PS,
|
|
; EG-NEXT: ADD_INT T0.Y, PV.Y, PV.Z,
|
|
; EG-NEXT: LSHR T0.Z, T15.W, literal.x,
|
|
; EG-NEXT: LSHR T2.W, T14.Y, literal.y,
|
|
; EG-NEXT: LSHR * T3.W, T14.W, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T18.X, T15.X, literal.x,
|
|
; EG-NEXT: LSHR T1.Y, T14.X, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT T1.Z, PV.W, PS,
|
|
; EG-NEXT: LSHR T2.W, T15.Y, literal.z,
|
|
; EG-NEXT: ADD_INT * T3.W, PV.Y, PV.Z,
|
|
; EG-NEXT: 24(3.363116e-44), 8(1.121039e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T19.X, T14.Z, literal.x,
|
|
; EG-NEXT: ADD_INT T0.Y, T14.Y, T14.W,
|
|
; EG-NEXT: AND_INT T0.Z, PS, literal.y,
|
|
; EG-NEXT: ADD_INT T2.W, PV.Z, PV.W,
|
|
; EG-NEXT: LSHR * T3.W, T15.W, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ADD_INT T20.X, PV.W, PS,
|
|
; EG-NEXT: LSHL T2.Y, PV.Z, literal.x,
|
|
; EG-NEXT: ADD_INT T0.Z, PV.Y, T15.Y,
|
|
; EG-NEXT: ADD_INT T2.W, T1.Y, PV.X,
|
|
; EG-NEXT: LSHR * T3.W, T15.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ADD_INT T19.X, T14.X, T14.Z,
|
|
; EG-NEXT: ADD_INT T0.Y, PV.W, PS,
|
|
; EG-NEXT: LSHR T1.Z, T15.Z, literal.x,
|
|
; EG-NEXT: LSHR T2.W, T14.X, literal.y,
|
|
; EG-NEXT: LSHR * T3.W, T14.Z, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: ADD_INT T14.X, PV.W, PS,
|
|
; EG-NEXT: LSHR T1.Y, T15.X, literal.x,
|
|
; EG-NEXT: ADD_INT T1.Z, PV.Y, PV.Z,
|
|
; EG-NEXT: ADD_INT T2.W, PV.X, T15.X,
|
|
; EG-NEXT: ADD_INT * T3.W, T0.Z, T15.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T15.X, PS, literal.x,
|
|
; EG-NEXT: ADD_INT T0.Y, PV.W, T15.Z,
|
|
; EG-NEXT: AND_INT T0.Z, PV.Z, literal.x,
|
|
; EG-NEXT: ADD_INT T2.W, PV.X, PV.Y,
|
|
; EG-NEXT: LSHR * T3.W, T15.Z, literal.y,
|
|
; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
|
|
; EG-NEXT: ADD_INT T14.X, PV.W, PS,
|
|
; EG-NEXT: LSHL T1.Y, PV.Z, literal.x,
|
|
; EG-NEXT: AND_INT T0.Z, PV.Y, literal.y,
|
|
; EG-NEXT: OR_INT T2.W, PV.X, T2.Y,
|
|
; EG-NEXT: LSHL * T3.W, T20.X, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT T15.X, PV.W, PS,
|
|
; EG-NEXT: OR_INT T0.Y, PV.Z, PV.Y,
|
|
; EG-NEXT: LSHL T0.Z, PV.X, literal.x,
|
|
; EG-NEXT: ADD_INT T2.W, T17.X, T18.X,
|
|
; EG-NEXT: LSHR * T3.W, T15.Z, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: ADD_INT T1.Y, PV.W, PS,
|
|
; EG-NEXT: OR_INT T0.Z, PV.Y, PV.Z,
|
|
; EG-NEXT: AND_INT T2.W, PV.X, literal.x,
|
|
; EG-NEXT: LSHL * T3.W, T16.X, literal.y,
|
|
; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
|
|
; EG-NEXT: OR_INT T14.Y, PV.W, PS,
|
|
; EG-NEXT: AND_INT T2.W, PV.Z, literal.x,
|
|
; EG-NEXT: LSHL * T3.W, PV.Y, literal.y,
|
|
; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
|
|
; EG-NEXT: OR_INT T14.X, PV.W, PS,
|
|
; EG-NEXT: ADD_INT * T2.W, T0.W, T1.W,
|
|
; EG-NEXT: LSHR * T15.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: ALU clause starting at 107:
|
|
; EG-NEXT: ADD_INT * T1.W, T1.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: SETE_INT * T2.W, PV.W, literal.x,
|
|
; EG-NEXT: 8192(1.147944e-41), 0(0.000000e+00)
|
|
; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
|
|
; IR under test. Each loop iteration reloads the same 32 bytes from %src,
; sums the four 8-byte groups lane-wise, and stores the 8-byte sum to the
; %i.01-th <8 x i8> slot of %result. The autogenerated checks above expect
; that 32-byte load to appear as two VTX_READ_128 fetches.
entry:
|
|
br label %for.body
|
|
|
|
; Loop exit: reached once the incremented counter equals 1024.
for.exit: ; preds = %for.body
|
|
ret void
|
|
|
|
for.body: ; preds = %for.body, %entry
|
|
; Loop counter: 0 on entry from %entry, %tmp19 on the back edge.
%i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]
|
|
; Load 32 bytes from the loop-invariant %src address as <8 x i32>.
%vecload2 = load <8 x i32>, ptr addrspace(1) %src, align 32
|
|
; Reinterpret the loaded value as 32 individual i8 lanes.
%0 = bitcast <8 x i32> %vecload2 to <32 x i8>
|
|
; Lanes 0-7: first 8-byte group.
%tmp5 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
; Lanes 8-15: second 8-byte group.
%tmp8 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
|
; Running lane-wise sum of the groups extracted so far.
%tmp9 = add nsw <8 x i8> %tmp5, %tmp8
|
|
; Lanes 16-23: third 8-byte group.
%tmp12 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
|
|
%tmp13 = add nsw <8 x i8> %tmp9, %tmp12
|
|
; Lanes 24-31: fourth 8-byte group.
%tmp16 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
|
|
; Final <8 x i8> sum of all four groups.
%tmp17 = add nsw <8 x i8> %tmp13, %tmp16
|
|
; Destination address: %result indexed by %i.01 in <8 x i8> (8-byte) units.
%scevgep = getelementptr <8 x i8>, ptr addrspace(1) %result, i32 %i.01
|
|
; Repack the eight summed bytes as <2 x i32> so they are written by one store.
%1 = bitcast <8 x i8> %tmp17 to <2 x i32>
|
|
store <2 x i32> %1, ptr addrspace(1) %scevgep, align 8
|
|
; Advance the counter; exit after the 1024th iteration, otherwise loop again.
%tmp19 = add nsw i32 %i.01, 1
|
|
%exitcond = icmp eq i32 %tmp19, 1024
|
|
br i1 %exitcond, label %for.exit, label %for.body
|
|
}
|