
Extend sra i64 simplification to shift constants in range [33:62]. Shift amounts 32 and 63 were already handled. New testing for shift amts 33 and 62 added in sra.ll. Changes to other test files were to adapt previous test results to this extension. --------- Signed-off-by: John Lu <John.Lu@amd.com>
904 lines
32 KiB
LLVM
904 lines
32 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
|
|
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
|
|
; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
|
|
|
|
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
|
|
; test_smul24_i32: kernel that multiplies two i32 arguments after
; sign-extending the low 24 bits of each (shl by 8 followed by ashr by 8)
; and stores the i32 product to %out.
; The autogenerated assertions below expect an ordinary 32-bit multiply on
; every run line (s_bfe_i32 + s_mul_i32 on the amdgcn targets, LSHL/ASHR +
; MULLO_INT on the r600 targets); no 24-bit multiply opcode appears.
define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
|
|
; SI-LABEL: test_smul24_i32:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_bfe_i32 s2, s2, 0x180000
|
|
; SI-NEXT: s_bfe_i32 s3, s3, 0x180000
|
|
; SI-NEXT: s_mul_i32 s2, s2, s3
|
|
; SI-NEXT: s_mov_b32 s6, -1
|
|
; SI-NEXT: s_mov_b32 s4, s0
|
|
; SI-NEXT: s_mov_b32 s5, s1
|
|
; SI-NEXT: v_mov_b32_e32 v0, s2
|
|
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: test_smul24_i32:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; VI-NEXT: s_mov_b32 s6, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s4, s0
|
|
; VI-NEXT: s_mov_b32 s5, s1
|
|
; VI-NEXT: s_bfe_i32 s0, s2, 0x180000
|
|
; VI-NEXT: s_bfe_i32 s1, s3, 0x180000
|
|
; VI-NEXT: s_mul_i32 s0, s0, s1
|
|
; VI-NEXT: v_mov_b32_e32 v0, s0
|
|
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: test_smul24_i32:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
|
; GFX9-NEXT: s_mov_b32 s6, -1
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_mov_b32 s4, s0
|
|
; GFX9-NEXT: s_mov_b32 s5, s1
|
|
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
|
|
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
|
|
; GFX9-NEXT: s_mul_i32 s0, s0, s1
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: test_smul24_i32:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: ALU clause starting at 4:
|
|
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
|
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: test_smul24_i32:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: ALU clause starting at 4:
|
|
; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
|
|
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
|
|
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
|
|
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
|
|
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
|
|
entry:
|
|
; shl then ashr by 8 leaves the low 24 bits of the operand, sign-extended to i32.
%a.shl = shl i32 %a, 8
|
|
%a.24 = ashr i32 %a.shl, 8
|
|
%b.shl = shl i32 %b, 8
|
|
%b.24 = ashr i32 %b.shl, 8
|
|
%mul24 = mul i32 %a.24, %b.24
|
|
store i32 %mul24, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; test_smulhi24_i64: kernel that sign-extends the low 24 bits of %a and %b,
; widens both to i64, multiplies, and stores bits [63:32] of the product
; (lshr by 32, then trunc to i32) to %out.
; Expected selection per the assertions below: v_mul_hi_i32_i24 on the default
; amdgcn and tonga runs, s_bfe_i32 + s_mul_hi_i32 on gfx900, LSHL/ASHR +
; MULHI_INT on redwood, and MULHI_INT24 applied directly to the kernel
; arguments on cayman.
define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
|
|
; SI-LABEL: test_smulhi24_i64:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; SI-NEXT: s_mov_b32 s6, -1
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s4, s0
|
|
; SI-NEXT: s_mov_b32 s5, s1
|
|
; SI-NEXT: v_mov_b32_e32 v0, s3
|
|
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
|
|
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: test_smulhi24_i64:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; VI-NEXT: s_mov_b32 s6, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_mov_b32_e32 v0, s3
|
|
; VI-NEXT: s_mov_b32 s4, s0
|
|
; VI-NEXT: s_mov_b32 s5, s1
|
|
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
|
|
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: test_smulhi24_i64:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
|
; GFX9-NEXT: s_mov_b32 s6, -1
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_mov_b32 s4, s0
|
|
; GFX9-NEXT: s_mov_b32 s5, s1
|
|
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
|
|
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
|
|
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: test_smulhi24_i64:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: ALU clause starting at 4:
|
|
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
|
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: MULHI_INT * T1.X, PS, PV.W,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: test_smulhi24_i64:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: ALU clause starting at 4:
|
|
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
|
|
; CM-NEXT: MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
|
|
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
|
|
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
|
|
entry:
|
|
; shl then ashr by 8 leaves the low 24 bits of the operand, sign-extended to i32.
%a.shl = shl i32 %a, 8
|
|
%a.24 = ashr i32 %a.shl, 8
|
|
%b.shl = shl i32 %b, 8
|
|
%b.24 = ashr i32 %b.shl, 8
|
|
%a.24.i64 = sext i32 %a.24 to i64
|
|
%b.24.i64 = sext i32 %b.24 to i64
|
|
%mul48 = mul i64 %a.24.i64, %b.24.i64
|
|
; Only the upper 32 bits of the i64 product are kept and stored.
%mul48.hi = lshr i64 %mul48, 32
|
|
%mul24hi = trunc i64 %mul48.hi to i32
|
|
store i32 %mul24hi, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; test_smul48_i64: non-kernel function returning the i64 product of two i64
; arguments whose low 24 bits have been sign-extended in place (shl by 40
; followed by ashr by 40).
; On the amdgcn runs the assertions expect only v_mul_hi_i32_i24 and
; v_mul_i32_i24 (operating on v0 and v2) with no separate extension code.
; For the r600 runs only CF_END and PAD are asserted.
define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
|
|
; SI-LABEL: test_smul48_i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
|
|
; SI-NEXT: v_mul_i32_i24_e32 v0, v0, v2
|
|
; SI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; VI-LABEL: test_smul48_i64:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
|
|
; VI-NEXT: v_mul_i32_i24_e32 v0, v0, v2
|
|
; VI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: test_smul48_i64:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
|
|
; GFX9-NEXT: v_mul_i32_i24_e32 v0, v0, v2
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; EG-LABEL: test_smul48_i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
;
|
|
; CM-LABEL: test_smul48_i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; shl then ashr by 40 leaves the low 24 bits of the i64 operand, sign-extended.
%shl.lhs = shl i64 %lhs, 40
|
|
%lhs24 = ashr i64 %shl.lhs, 40
|
|
%shl.rhs = shl i64 %rhs, 40
|
|
%rhs24 = ashr i64 %shl.rhs, 40
|
|
%mul = mul i64 %lhs24, %rhs24
|
|
ret i64 %mul
|
|
}
|
|
|
|
; test_smul48_v2i64: <2 x i64> variant of the 48-bit signed multiply test.
; Each lane of both operands is sign-extended from its low 24 bits (shl by 40
; followed by ashr by 40) before the lane-wise i64 multiply.
; On the amdgcn runs the assertions expect one v_mul_hi_i32_i24 /
; v_mul_i32_i24 pair per lane and nothing else besides the wait and return.
; For the r600 runs only CF_END and PAD are asserted.
define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
|
|
; SI-LABEL: test_smul48_v2i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v4
|
|
; SI-NEXT: v_mul_i32_i24_e32 v0, v0, v4
|
|
; SI-NEXT: v_mul_hi_i32_i24_e32 v3, v2, v6
|
|
; SI-NEXT: v_mul_i32_i24_e32 v2, v2, v6
|
|
; SI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; VI-LABEL: test_smul48_v2i64:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v4
|
|
; VI-NEXT: v_mul_i32_i24_e32 v0, v0, v4
|
|
; VI-NEXT: v_mul_hi_i32_i24_e32 v3, v2, v6
|
|
; VI-NEXT: v_mul_i32_i24_e32 v2, v2, v6
|
|
; VI-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: test_smul48_v2i64:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v4
|
|
; GFX9-NEXT: v_mul_i32_i24_e32 v0, v0, v4
|
|
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v3, v2, v6
|
|
; GFX9-NEXT: v_mul_i32_i24_e32 v2, v2, v6
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; EG-LABEL: test_smul48_v2i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
;
|
|
; CM-LABEL: test_smul48_v2i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; Per-lane shl then ashr by 40 leaves the low 24 bits of each element, sign-extended.
%shl.lhs = shl <2 x i64> %lhs, <i64 40, i64 40>
|
|
%lhs24 = ashr <2 x i64> %shl.lhs, <i64 40, i64 40>
|
|
%shl.rhs = shl <2 x i64> %rhs, <i64 40, i64 40>
|
|
%rhs24 = ashr <2 x i64> %shl.rhs, <i64 40, i64 40>
|
|
%mul = mul <2 x i64> %lhs24, %rhs24
|
|
ret <2 x i64> %mul
|
|
}
|
|
|
|
; This requires handling the original 64-bit mul node to eliminate the
|
|
; unnecessary extension instructions: after legalization they have
|
|
; multiple uses (the separate mul and mulhi), so SimplifyDemandedBits
|
|
; will not remove them.
|
|
; test_smul24_i64: kernel producing the full i64 product of two i32 arguments
; that are first sign-extended from 24 bits (shl/ashr by 8) and then widened
; with sext. Unnamed [8 x i32] parameters precede %a and %b in the signature.
; The assertions expect one low-half and one high-half multiply per run line
; (s_mul_i32, v_mul_i32_i24 or MULLO_INT for the low dword; v_mul_hi_i32_i24,
; s_mul_hi_i32, MULHI_INT or MULHI_INT24 for the high dword).
define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
|
|
; SI-LABEL: test_smul24_i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
|
|
; SI-NEXT: s_load_dword s4, s[4:5], 0x1c
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_bfe_i32 s5, s6, 0x180000
|
|
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
|
|
; SI-NEXT: v_mov_b32_e32 v0, s5
|
|
; SI-NEXT: s_mul_i32 s5, s4, s5
|
|
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s4, v0
|
|
; SI-NEXT: v_mov_b32_e32 v0, s5
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: test_smul24_i64:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_load_dword s6, s[4:5], 0x4c
|
|
; VI-NEXT: s_load_dword s7, s[4:5], 0x70
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_bfe_i32 s4, s6, 0x180000
|
|
; VI-NEXT: s_bfe_i32 s5, s7, 0x180000
|
|
; VI-NEXT: v_mov_b32_e32 v0, s4
|
|
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0
|
|
; VI-NEXT: v_mul_i32_i24_e32 v0, s5, v0
|
|
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: test_smul24_i64:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x4c
|
|
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x70
|
|
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
|
; GFX9-NEXT: s_mov_b32 s2, -1
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000
|
|
; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000
|
|
; GFX9-NEXT: s_mul_hi_i32 s6, s5, s4
|
|
; GFX9-NEXT: s_mul_i32 s5, s5, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s6
|
|
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: test_smul24_i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: ALU clause starting at 4:
|
|
; EG-NEXT: LSHL T0.W, KC0[4].Z, literal.x,
|
|
; EG-NEXT: LSHL * T1.W, KC0[6].W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
|
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PS,
|
|
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: MULLO_INT * T0.X, T1.W, T0.W,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: test_smul24_i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: ALU clause starting at 4:
|
|
; CM-NEXT: LSHL T0.Z, KC0[4].Z, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, KC0[6].W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
|
|
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: MULLO_INT T1.X, T1.Z, T0.W,
|
|
; CM-NEXT: MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
|
|
; CM-NEXT: MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
|
|
; CM-NEXT: MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
|
|
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
|
|
; CM-NEXT: MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
|
|
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
|
|
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
|
|
; shl then ashr by 8 leaves the low 24 bits of the operand, sign-extended to i32.
%shl.i = shl i32 %a, 8
|
|
%shr.i = ashr i32 %shl.i, 8
|
|
%conv.i = sext i32 %shr.i to i64
|
|
%shl1.i = shl i32 %b, 8
|
|
%shr2.i = ashr i32 %shl1.i, 8
|
|
%conv3.i = sext i32 %shr2.i to i64
|
|
%mul.i = mul i64 %conv3.i, %conv.i
|
|
store i64 %mul.i, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; test_smul24_i64_square: kernel that sign-extends %a from 24 bits (shl/ashr
; by 8), widens it to i64 and multiplies the value by itself, storing the i64
; square to %out. %b is declared but never used in the body.
; The assertions expect both multiply halves to take the same register (or
; the same kernel argument on cayman) for both source operands.
define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
|
|
; SI-LABEL: test_smul24_i64_square:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_bfe_i32 s4, s6, 0x180000
|
|
; SI-NEXT: s_mul_i32 s5, s4, s4
|
|
; SI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
|
|
; SI-NEXT: v_mov_b32_e32 v0, s5
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: test_smul24_i64_square:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_load_dword s6, s[4:5], 0x2c
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_bfe_i32 s4, s6, 0x180000
|
|
; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
|
|
; VI-NEXT: v_mul_i32_i24_e64 v0, s4, s4
|
|
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: test_smul24_i64_square:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
|
|
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
|
; GFX9-NEXT: s_mov_b32 s2, -1
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000
|
|
; GFX9-NEXT: s_mul_hi_i32 s5, s4, s4
|
|
; GFX9-NEXT: s_mul_i32 s4, s4, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: test_smul24_i64_square:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: ALU clause starting at 4:
|
|
; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PV.W,
|
|
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.W,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: test_smul24_i64_square:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: ALU clause starting at 4:
|
|
; CM-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: ASHR * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.W,
|
|
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.W,
|
|
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.W,
|
|
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.W,
|
|
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
|
|
; CM-NEXT: MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
|
|
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
|
|
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
|
|
; shl then ashr by 8 leaves the low 24 bits of %a, sign-extended to i32.
%shl.i = shl i32 %a, 8
|
|
%shr.i = ashr i32 %shl.i, 8
|
|
%conv.i = sext i32 %shr.i to i64
|
|
; Both multiplicands are the same value.
%mul.i = mul i64 %conv.i, %conv.i
|
|
store i64 %mul.i, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; test_smul24_i33: 24-bit signed multiply carried out in the non-power-of-two
; type i33. shl/ashr by 9 sign-extends the low 24 bits of each i33 argument;
; the i33 product is then sign-extended to i64 and stored to %out.
; The amdgcn assertions show the final sign extension as a 64-bit shift left
; by 31 followed by an arithmetic shift right by 31; the r600 assertions show
; it as BFE_INT applied to the multiply-high result.
define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 {
|
|
; SI-LABEL: test_smul24_i33:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
|
; SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; SI-NEXT: s_mov_b32 s6, -1
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s4, s0
|
|
; SI-NEXT: s_mov_b32 s5, s1
|
|
; SI-NEXT: s_bfe_i32 s0, s8, 0x180000
|
|
; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
|
|
; SI-NEXT: v_mov_b32_e32 v0, s0
|
|
; SI-NEXT: s_mul_i32 s0, s1, s0
|
|
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0
|
|
; SI-NEXT: v_mov_b32_e32 v0, s0
|
|
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
|
|
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: test_smul24_i33:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
|
; VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; VI-NEXT: s_mov_b32 s6, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_bfe_i32 s2, s2, 0x180000
|
|
; VI-NEXT: s_bfe_i32 s3, s4, 0x180000
|
|
; VI-NEXT: v_mov_b32_e32 v0, s3
|
|
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
|
|
; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
|
|
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
|
|
; VI-NEXT: s_mov_b32 s4, s0
|
|
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
|
|
; VI-NEXT: s_mov_b32 s5, s1
|
|
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: test_smul24_i33:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
|
|
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34
|
|
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
|
; GFX9-NEXT: s_mov_b32 s2, -1
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000
|
|
; GFX9-NEXT: s_bfe_i32 s6, s7, 0x180000
|
|
; GFX9-NEXT: s_mul_hi_i32 s5, s4, s6
|
|
; GFX9-NEXT: s_mul_i32 s4, s4, s6
|
|
; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 31
|
|
; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 31
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: test_smul24_i33:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: ALU clause starting at 4:
|
|
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
|
|
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
|
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
|
|
; EG-NEXT: MULLO_INT * T1.X, T0.W, T1.W,
|
|
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: test_smul24_i33:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 16, @4, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: ALU clause starting at 4:
|
|
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
|
|
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
|
|
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
|
|
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
|
|
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
|
|
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
|
|
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
|
|
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
|
|
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
|
|
; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
entry:
|
|
; In i33, shl then ashr by 9 leaves the low 24 bits of the operand, sign-extended.
%a.shl = shl i33 %a, 9
|
|
%a.24 = ashr i33 %a.shl, 9
|
|
%b.shl = shl i33 %b, 9
|
|
%b.24 = ashr i33 %b.shl, 9
|
|
%mul24 = mul i33 %a.24, %b.24
|
|
%ext = sext i33 %mul24 to i64
|
|
store i64 %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; test_smulhi24_i33: kernel that multiplies two i33 arguments sign-extended
; from their low 24 bits (shl/ashr by 9), then stores bit 32 of the i33
; product (lshr by 32, then trunc to i32) to %out.
; The assertions expect a multiply-high (v_mul_hi_i32_i24, s_mul_hi_i32,
; MULHI_INT or MULHI_INT24 depending on the run line) followed by an AND
; with 1.
define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
|
|
; SI-LABEL: test_smulhi24_i33:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
|
|
; SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; SI-NEXT: s_mov_b32 s6, -1
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s4, s0
|
|
; SI-NEXT: s_mov_b32 s5, s1
|
|
; SI-NEXT: v_mov_b32_e32 v0, s8
|
|
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
|
|
; SI-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: test_smulhi24_i33:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
|
|
; VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; VI-NEXT: s_mov_b32 s6, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_mov_b32 s4, s0
|
|
; VI-NEXT: v_mov_b32_e32 v0, s8
|
|
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
|
|
; VI-NEXT: s_mov_b32 s5, s1
|
|
; VI-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: test_smulhi24_i33:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
|
|
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x34
|
|
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
|
; GFX9-NEXT: s_mov_b32 s2, -1
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000
|
|
; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000
|
|
; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5
|
|
; GFX9-NEXT: s_and_b32 s4, s4, 1
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: test_smulhi24_i33:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: ALU clause starting at 4:
|
|
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
|
|
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
|
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
|
|
; EG-NEXT: AND_INT T0.X, PS, 1,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: test_smulhi24_i33:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: ALU clause starting at 4:
|
|
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
|
|
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
|
|
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
|
|
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
|
|
; CM-NEXT: AND_INT * T0.X, PV.X, 1,
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
entry:
|
|
; In i33, shl then ashr by 9 leaves the low 24 bits of the operand, sign-extended.
%tmp0 = shl i33 %a, 9
|
|
%a_24 = ashr i33 %tmp0, 9
|
|
%tmp1 = shl i33 %b, 9
|
|
%b_24 = ashr i33 %tmp1, 9
|
|
%tmp2 = mul i33 %a_24, %b_24
|
|
; Shifting the i33 product right by 32 leaves only its top bit.
%hi = lshr i33 %tmp2, 32
|
|
%trunc = trunc i33 %hi to i32
|
|
|
|
store i32 %trunc, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; simplify_i24_crash: when %arg0 is zero (block bb11) the kernel splats
; element 0 of %arg1 and of %arg2 across a <2 x i32>, sign-extends every lane
; from 24 bits (shl/ashr by 8), multiplies lane-wise and stores the <2 x i32>
; result to %out; otherwise it branches straight to bb7 and returns.
; The assertions expect a single 32-bit multiply whose result is written to
; both stored lanes.
; Assumption to confirm - the name suggests this is a regression test for a
; compiler crash in 24-bit multiply simplification; verify against history.
define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
|
|
; SI-LABEL: simplify_i24_crash:
|
|
; SI: ; %bb.0: ; %bb
|
|
; SI-NEXT: s_load_dword s0, s[4:5], 0xb
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_cmp_lg_u32 s0, 0
|
|
; SI-NEXT: s_cbranch_scc0 .LBB8_2
|
|
; SI-NEXT: ; %bb.1: ; %bb7
|
|
; SI-NEXT: s_endpgm
|
|
; SI-NEXT: .LBB8_2: ; %bb11
|
|
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
|
|
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_bfe_i32 s0, s0, 0x180000
|
|
; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
|
|
; SI-NEXT: s_mul_i32 s0, s0, s1
|
|
; SI-NEXT: s_mov_b32 s6, -1
|
|
; SI-NEXT: v_mov_b32_e32 v0, s0
|
|
; SI-NEXT: v_mov_b32_e32 v1, s0
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: simplify_i24_crash:
|
|
; VI: ; %bb.0: ; %bb
|
|
; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_cmp_lg_u32 s0, 0
|
|
; VI-NEXT: s_cbranch_scc0 .LBB8_2
|
|
; VI-NEXT: ; %bb.1: ; %bb7
|
|
; VI-NEXT: s_endpgm
|
|
; VI-NEXT: .LBB8_2: ; %bb11
|
|
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; VI-NEXT: s_mov_b32 s6, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
|
|
; VI-NEXT: s_bfe_i32 s1, s2, 0x180000
|
|
; VI-NEXT: s_mul_i32 s0, s0, s1
|
|
; VI-NEXT: v_mov_b32_e32 v0, s0
|
|
; VI-NEXT: v_mov_b32_e32 v1, s0
|
|
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: simplify_i24_crash:
|
|
; GFX9: ; %bb.0: ; %bb
|
|
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
|
|
; GFX9-NEXT: s_cbranch_scc0 .LBB8_2
|
|
; GFX9-NEXT: ; %bb.1: ; %bb7
|
|
; GFX9-NEXT: s_endpgm
|
|
; GFX9-NEXT: .LBB8_2: ; %bb11
|
|
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
|
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
|
|
; GFX9-NEXT: s_mov_b32 s11, 0xf000
|
|
; GFX9-NEXT: s_mov_b32 s10, -1
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_bfe_i32 s0, s0, 0x180000
|
|
; GFX9-NEXT: s_bfe_i32 s1, s2, 0x180000
|
|
; GFX9-NEXT: s_mul_i32 s0, s0, s1
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
|
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: simplify_i24_crash:
|
|
; EG: ; %bb.0: ; %bb
|
|
; EG-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: JUMP @5 POP:1
|
|
; EG-NEXT: ALU 12, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
|
|
; EG-NEXT: POP @5 POP:1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: ALU clause starting at 6:
|
|
; EG-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
|
|
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV T0.W, KC0[3].Y,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].W,
|
|
; EG-NEXT: LSHL T1.W, PS, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T0.W, PS, literal.x,
|
|
; EG-NEXT: ASHR * T1.W, PV.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T2.W, KC0[2].Y,
|
|
; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
|
|
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
|
|
; EG-NEXT: MOV * T0.Y, PS,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: simplify_i24_crash:
|
|
; CM: ; %bb.0: ; %bb
|
|
; CM-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: JUMP @5 POP:1
|
|
; CM-NEXT: ALU 15, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
|
|
; CM-NEXT: POP @5 POP:1
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: ALU clause starting at 6:
|
|
; CM-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
|
|
; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV T0.Z, KC0[3].Y,
|
|
; CM-NEXT: MOV * T0.W, KC0[2].W,
|
|
; CM-NEXT: LSHL T1.Z, PV.W, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: MOV T0.Y, KC0[2].Y,
|
|
; CM-NEXT: ASHR T0.Z, PV.W, literal.x,
|
|
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: MULLO_INT T0.X, T0.W, T0.Z,
|
|
; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T0.Z,
|
|
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T0.Z,
|
|
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T0.Z,
|
|
; CM-NEXT: LSHR T1.X, T0.Y, literal.x,
|
|
; CM-NEXT: MOV * T0.Y, PV.X,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
bb:
|
|
%cmp = icmp eq i32 %arg0, 0
|
|
br i1 %cmp, label %bb11, label %bb7
|
|
|
|
bb11:
|
|
; A zeroinitializer shuffle mask broadcasts element 0 into both lanes.
%tmp14 = shufflevector <2 x i32> %arg1, <2 x i32> poison, <2 x i32> zeroinitializer
|
|
%tmp16 = shufflevector <2 x i32> %arg2, <2 x i32> poison, <2 x i32> zeroinitializer
|
|
; Per-lane shl then ashr by 8 leaves the low 24 bits, sign-extended to i32.
%tmp17 = shl <2 x i32> %tmp14, <i32 8, i32 8>
|
|
%tmp18 = ashr <2 x i32> %tmp17, <i32 8, i32 8>
|
|
%tmp19 = shl <2 x i32> %tmp16, <i32 8, i32 8>
|
|
%tmp20 = ashr <2 x i32> %tmp19, <i32 8, i32 8>
|
|
%tmp21 = mul <2 x i32> %tmp18, %tmp20
|
|
store <2 x i32> %tmp21, ptr addrspace(1) %out
|
|
br label %bb7
|
|
|
|
bb7:
|
|
ret void
|
|
|
|
}
|
|
|
|
; test_umul_i24: kernel that shifts %arg right by 9, zero-extends the result
; to i64, multiplies by the constant 4286595041 (printed as 0xff803fe1 in the
; assertions), shifts the product right by 1, truncates to i32 and stores the
; value through a null addrspace(1) pointer. %out is not used in the body.
; The assertions expect full-width unsigned multiplies (v_mul_hi_u32 with
; s_mul_i32, v_mad_u64_u32, s_mul_hi_u32 with s_mul_i32, or MULHI with
; MULLO_INT) followed by v_alignbit_b32 / BIT_ALIGN_INT with a shift of 1;
; no 24-bit multiply opcode appears.
define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
|
|
; SI-LABEL: test_umul_i24:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dword s2, s[4:5], 0xb
|
|
; SI-NEXT: v_mov_b32_e32 v0, 0xff803fe1
|
|
; SI-NEXT: s_mov_b64 s[0:1], 0
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_lshr_b32 s2, s2, 9
|
|
; SI-NEXT: v_mul_hi_u32 v0, s2, v0
|
|
; SI-NEXT: s_mul_i32 s2, s2, 0xff803fe1
|
|
; SI-NEXT: v_alignbit_b32 v0, v0, s2, 1
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: test_umul_i24:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_load_dword s0, s[4:5], 0x2c
|
|
; VI-NEXT: v_mov_b32_e32 v0, 0xff803fe1
|
|
; VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: s_lshr_b32 s0, s0, 9
|
|
; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v0, 0
|
|
; VI-NEXT: s_mov_b64 s[0:1], 0
|
|
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 1
|
|
; VI-NEXT: s_nop 2
|
|
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: test_umul_i24:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
|
|
; GFX9-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_lshr_b32 s2, s2, 9
|
|
; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0xff803fe1
|
|
; GFX9-NEXT: s_mul_i32 s2, s2, 0xff803fe1
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
|
|
; GFX9-NEXT: s_mov_b32 s2, -1
|
|
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: test_umul_i24:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: ALU clause starting at 4:
|
|
; EG-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
|
|
; EG-NEXT: MULHI * T0.X, PV.W, literal.x,
|
|
; EG-NEXT: -8372255(nan), 0(0.000000e+00)
|
|
; EG-NEXT: MULLO_INT * T0.Y, T0.W, literal.x,
|
|
; EG-NEXT: -8372255(nan), 0(0.000000e+00)
|
|
; EG-NEXT: BIT_ALIGN_INT T0.X, T0.X, PS, 1,
|
|
; EG-NEXT: MOV * T1.X, literal.x,
|
|
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: test_umul_i24:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: ALU clause starting at 4:
|
|
; CM-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
|
|
; CM-NEXT: 9(1.261169e-44), 0(0.000000e+00)
|
|
; CM-NEXT: MULHI T0.X, T0.W, literal.x,
|
|
; CM-NEXT: MULHI T0.Y (MASKED), T0.W, literal.x,
|
|
; CM-NEXT: MULHI T0.Z (MASKED), T0.W, literal.x,
|
|
; CM-NEXT: MULHI * T0.W (MASKED), T0.W, literal.x,
|
|
; CM-NEXT: -8372255(nan), 0(0.000000e+00)
|
|
; CM-NEXT: MULLO_INT T0.X (MASKED), T0.W, literal.x,
|
|
; CM-NEXT: MULLO_INT T0.Y, T0.W, literal.x,
|
|
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, literal.x,
|
|
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, literal.x,
|
|
; CM-NEXT: -8372255(nan), 0(0.000000e+00)
|
|
; CM-NEXT: BIT_ALIGN_INT * T0.X, T0.X, PV.Y, 1,
|
|
; CM-NEXT: MOV * T1.X, literal.x,
|
|
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
|
|
; After the logical shift right by 9, the top 9 bits of %i are zero.
%i = lshr i32 %arg, 9
|
|
%i1 = zext i32 %i to i64
|
|
%i2 = mul i64 %i1, 4286595041
|
|
%i3 = lshr i64 %i2, 1
|
|
%i4 = trunc i64 %i3 to i32
|
|
store i32 %i4, ptr addrspace(1) null, align 4
|
|
ret void
|
|
}
|
|
|
|
; Attribute group #0: applies nounwind to every definition in this file that
; is declared with a trailing #0.
attributes #0 = { nounwind }
|