Use the default, which freely coalesces anything it can. This mostly shows improvements, with a handful of regressions. The main concern would be if introducing wider registers is more likely to push the register usage up to the next occupancy tier.
1260 lines
48 KiB
LLVM
1260 lines
48 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefixes=GCN,SI %s
|
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefixes=GCN,CI %s
|
|
|
|
; This test is mostly to test DAG store merging, so disable the vectorizer.
|
|
; Run with devices with different unaligned load restrictions.
|
|
|
|
; TODO: Vector element tests
|
|
; TODO: Non-zero base offset for load and store combinations
|
|
; TODO: Same base addrspacecasted
|
|
|
|
; Two constant i8 stores to adjacent bytes; the store to %out is marked align 2.
; The checks expect a single 16-bit store of 0x7bc8: 0x7b is 123, and 0xc8 is
; what remains of the out-of-range literal 456 once it is held in an i8.
define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_2_constants_i8:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x7bc8
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1
|
|
|
|
store i8 123, ptr addrspace(1) %out.gep.1
|
|
store i8 456, ptr addrspace(1) %out, align 2
|
|
ret void
|
|
}
|
|
|
|
; Same two constant i8 stores, but with no explicit alignment on either store.
; The checks expect them to stay as two separate byte stores (0x7b at offset 1,
; 0xc8 at offset 0) rather than being merged.
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_2_constants_i8_natural_align:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0xc8
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1
|
|
; GCN-NEXT: buffer_store_byte v1, off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1
|
|
|
|
store i8 123, ptr addrspace(1) %out.gep.1
|
|
store i8 456, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Two constant i16 stores to adjacent elements; the store to %out is marked
; align 4. The checks expect a single dword store of 0x7b01c8, i.e. 123 in the
; high half and 456 (0x1c8) in the low half.
define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_2_constants_i16:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x7b01c8
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
|
|
|
|
store i16 123, ptr addrspace(1) %out.gep.1
|
|
store i16 456, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Two zero i16 stores to adjacent elements; the store to %out is marked align 4.
; The checks expect a single dword store of 0.
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_2_constants_0_i16:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
|
|
|
|
store i16 0, ptr addrspace(1) %out.gep.1
|
|
store i16 0, ptr addrspace(1) %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Two constant i16 stores to adjacent elements with no explicit alignment.
; The checks expect them to stay as two separate 16-bit stores (0x7b at
; offset 2, 0x1c8 at offset 0).
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_2_constants_i16_natural_align:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0x1c8
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2
|
|
; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
|
|
|
|
store i16 123, ptr addrspace(1) %out.gep.1
|
|
store i16 456, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Two constant i32 stores to adjacent elements, written high element first.
; The checks expect a single dwordx2 store with 456 (0x1c8) in v0 and
; 123 (0x7b) in v1.
define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_2_constants_i32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x1c8
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
|
|
store i32 123, ptr addrspace(1) %out.gep.1
|
|
store i32 456, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Mixed-type pair: an i32 constant at element 0 and a float constant at
; element 1. The checks expect both to be merged into a single dwordx2 store.
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_2_constants_i32_f32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x1c8
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
store float 1.0, ptr addrspace(1) %out.gep.1
|
|
store i32 456, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Mixed-type pair in the opposite arrangement: a float constant at element 0
; and an i32 constant at element 1. The checks expect a single dwordx2 store.
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_2_constants_f32_i32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 4.0
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
|
|
store i32 123, ptr addrspace(1) %out.gep.1
|
|
store float 4.0, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Four constant i32 stores to elements 1, 2, 3 and then 0. The checks expect
; one dwordx4 store whose registers hold the values in address order
; (1234, 123, 456, 333).
define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_4_constants_i32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x4d2
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 0x1c8
|
|
; GCN-NEXT: v_mov_b32_e32 v3, 0x14d
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
|
|
%out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
|
|
|
|
store i32 123, ptr addrspace(1) %out.gep.1
|
|
store i32 456, ptr addrspace(1) %out.gep.2
|
|
store i32 333, ptr addrspace(1) %out.gep.3
|
|
store i32 1234, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Four constant float stores issued in ascending address order (elements
; 0, 1, 2, 3). The checks expect a single dwordx4 store; 0x41000000 is the
; bit pattern of 8.0.
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_4_constants_f32_order:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x41000000
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 2.0
|
|
; GCN-NEXT: v_mov_b32_e32 v3, 4.0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
|
|
%out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
|
|
|
|
store float 8.0, ptr addrspace(1) %out
|
|
store float 1.0, ptr addrspace(1) %out.gep.1
|
|
store float 2.0, ptr addrspace(1) %out.gep.2
|
|
store float 4.0, ptr addrspace(1) %out.gep.3
|
|
ret void
|
|
}
|
|
|
|
; The store to the first element (%out) is issued last, i.e. out of address order.
|
|
; Four constant float stores where elements 1, 2, 3 are written before
; element 0. The checks expect the same single dwordx4 store as when the
; stores are issued in address order.
define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_4_constants_f32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x41000000
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 2.0
|
|
; GCN-NEXT: v_mov_b32_e32 v3, 4.0
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
|
|
%out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
|
|
|
|
store float 1.0, ptr addrspace(1) %out.gep.1
|
|
store float 2.0, ptr addrspace(1) %out.gep.2
|
|
store float 4.0, ptr addrspace(1) %out.gep.3
|
|
store float 8.0, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Four adjacent 32-bit constant stores alternating between float and i32
; (float 8.0, i32 11, float 2.0, i32 17 in address order). The checks expect
; them to be merged into a single dwordx4 store.
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_4_constants_mixed_i32_f32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x41000000
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 11
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 2.0
|
|
; GCN-NEXT: v_mov_b32_e32 v3, 17
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
|
|
%out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
|
|
|
|
|
|
store i32 11, ptr addrspace(1) %out.gep.1
|
|
store float 2.0, ptr addrspace(1) %out.gep.2
|
|
store i32 17, ptr addrspace(1) %out.gep.3
|
|
store float 8.0, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Three adjacent constant i32 stores. The expected output differs per RUN line:
; the tahiti checks show a dword store at offset 8 plus a dwordx2 store, while
; the bonaire checks show a single dwordx3 store.
define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 {
|
|
; SI-LABEL: merge_global_store_3_constants_i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x1c8
|
|
; SI-NEXT: v_mov_b32_e32 v0, 0x4d2
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8
|
|
; SI-NEXT: s_waitcnt expcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_global_store_3_constants_i32:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; CI-NEXT: s_mov_b32 s3, 0xf000
|
|
; CI-NEXT: s_mov_b32 s2, -1
|
|
; CI-NEXT: v_mov_b32_e32 v2, 0x1c8
|
|
; CI-NEXT: v_mov_b32_e32 v0, 0x4d2
|
|
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
|
|
; CI-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
|
|
|
|
store i32 123, ptr addrspace(1) %out.gep.1
|
|
store i32 456, ptr addrspace(1) %out.gep.2
|
|
store i32 1234, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Two adjacent constant i64 stores. The checks expect a single dwordx4 store;
; the zero high halves of both values are materialized once in v1 and copied
; into v3.
define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 {
|
|
; GCN-LABEL: merge_global_store_2_constants_i64:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x1c8
|
|
; GCN-NEXT: v_mov_b32_e32 v2, 0x7b
|
|
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
|
|
|
|
store i64 123, ptr addrspace(1) %out.gep.1
|
|
store i64 456, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Four adjacent constant i64 stores. Both RUN lines expect two dwordx4 stores
; (offset 16, then offset 0) that reuse v[0:3]; they differ only in the
; instruction between the two stores (s_waitcnt expcnt(0) on tahiti, s_nop 0
; on bonaire).
define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 {
|
|
; SI-LABEL: merge_global_store_4_constants_i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: v_mov_b32_e32 v0, 0x1c8
|
|
; SI-NEXT: v_mov_b32_e32 v2, 0x14d
|
|
; SI-NEXT: v_mov_b32_e32 v3, v1
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; SI-NEXT: s_waitcnt expcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, 0x4d2
|
|
; SI-NEXT: v_mov_b32_e32 v2, 0x7b
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_global_store_4_constants_i64:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; CI-NEXT: v_mov_b32_e32 v1, 0
|
|
; CI-NEXT: s_mov_b32 s3, 0xf000
|
|
; CI-NEXT: s_mov_b32 s2, -1
|
|
; CI-NEXT: v_mov_b32_e32 v0, 0x1c8
|
|
; CI-NEXT: v_mov_b32_e32 v2, 0x14d
|
|
; CI-NEXT: v_mov_b32_e32 v3, v1
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; CI-NEXT: s_nop 0
|
|
; CI-NEXT: v_mov_b32_e32 v0, 0x4d2
|
|
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
|
|
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; CI-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
|
|
%out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2
|
|
%out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3
|
|
|
|
store i64 123, ptr addrspace(1) %out.gep.1
|
|
store i64 456, ptr addrspace(1) %out.gep.2
|
|
store i64 333, ptr addrspace(1) %out.gep.3
|
|
store i64 1234, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Copies two adjacent i32 values from %in to the same positions in %out.
; The checks expect one dwordx2 load followed by one dwordx2 store.
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_2_adjacent_loads_i32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
%in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
|
|
|
|
%lo = load i32, ptr addrspace(1) %in
|
|
%hi = load i32, ptr addrspace(1) %in.gep.1
|
|
|
|
store i32 %lo, ptr addrspace(1) %out
|
|
store i32 %hi, ptr addrspace(1) %out.gep.1
|
|
ret void
|
|
}
|
|
|
|
; Copies two adjacent i32 values starting at element 2 of both %in and %out.
; The checks expect one dwordx2 load and one dwordx2 store, each with the
; base folded into an immediate offset:8.
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_2_adjacent_loads_i32_nonzero_base:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s4, s2
|
|
; GCN-NEXT: s_mov_b32 s5, s3
|
|
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:8
|
|
; GCN-NEXT: s_mov_b32 s2, s6
|
|
; GCN-NEXT: s_mov_b32 s3, s7
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:8
|
|
; GCN-NEXT: s_endpgm
|
|
%in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2
|
|
%in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3
|
|
|
|
%out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 2
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 3
|
|
%lo = load i32, ptr addrspace(1) %in.gep.0
|
|
%hi = load i32, ptr addrspace(1) %in.gep.1
|
|
|
|
store i32 %lo, ptr addrspace(1) %out.gep.0
|
|
store i32 %hi, ptr addrspace(1) %out.gep.1
|
|
ret void
|
|
}
|
|
|
|
; Loads two adjacent i32 values and stores them swapped (%hi to element 0,
; %lo to element 1). The checks expect a dwordx2 load, one register copy
; (v2 = v0), and a dwordx2 store of v[1:2].
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_2_adjacent_loads_shuffle_i32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v2, v0
|
|
; GCN-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
%in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
|
|
|
|
%lo = load i32, ptr addrspace(1) %in
|
|
%hi = load i32, ptr addrspace(1) %in.gep.1
|
|
|
|
store i32 %hi, ptr addrspace(1) %out
|
|
store i32 %lo, ptr addrspace(1) %out.gep.1
|
|
ret void
|
|
}
|
|
|
|
; Copies four adjacent i32 values from %in to the same positions in %out.
; The checks expect one dwordx4 load followed by one dwordx4 store.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_4_adjacent_loads_i32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
|
|
%out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
|
|
%in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
|
|
%in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
|
|
%in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
|
|
|
|
%x = load i32, ptr addrspace(1) %in
|
|
%y = load i32, ptr addrspace(1) %in.gep.1
|
|
%z = load i32, ptr addrspace(1) %in.gep.2
|
|
%w = load i32, ptr addrspace(1) %in.gep.3
|
|
|
|
store i32 %x, ptr addrspace(1) %out
|
|
store i32 %y, ptr addrspace(1) %out.gep.1
|
|
store i32 %z, ptr addrspace(1) %out.gep.2
|
|
store i32 %w, ptr addrspace(1) %out.gep.3
|
|
ret void
|
|
}
|
|
|
|
; Copies three adjacent i32 values from %in to %out. The expected output
; differs per RUN line: the tahiti checks split the copy into a dword and a
; dwordx2 load/store pair, while the bonaire checks use one dwordx3 load and
; one dwordx3 store.
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; SI-LABEL: merge_global_store_3_adjacent_loads_i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; SI-NEXT: s_mov_b32 s6, -1
|
|
; SI-NEXT: s_mov_b32 s10, s6
|
|
; SI-NEXT: s_mov_b32 s11, s7
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s8, s2
|
|
; SI-NEXT: s_mov_b32 s9, s3
|
|
; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8
|
|
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; SI-NEXT: s_mov_b32 s4, s0
|
|
; SI-NEXT: s_mov_b32 s5, s1
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_global_store_3_adjacent_loads_i32:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; CI-NEXT: s_mov_b32 s7, 0xf000
|
|
; CI-NEXT: s_mov_b32 s6, -1
|
|
; CI-NEXT: s_mov_b32 s10, s6
|
|
; CI-NEXT: s_mov_b32 s11, s7
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: s_mov_b32 s8, s2
|
|
; CI-NEXT: s_mov_b32 s9, s3
|
|
; CI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
|
|
; CI-NEXT: s_mov_b32 s4, s0
|
|
; CI-NEXT: s_mov_b32 s5, s1
|
|
; CI-NEXT: s_waitcnt vmcnt(0)
|
|
; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
|
|
; CI-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
|
|
%in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
|
|
%in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
|
|
|
|
%x = load i32, ptr addrspace(1) %in
|
|
%y = load i32, ptr addrspace(1) %in.gep.1
|
|
%z = load i32, ptr addrspace(1) %in.gep.2
|
|
|
|
store i32 %x, ptr addrspace(1) %out
|
|
store i32 %y, ptr addrspace(1) %out.gep.1
|
|
store i32 %z, ptr addrspace(1) %out.gep.2
|
|
ret void
|
|
}
|
|
|
|
; Float variant of the four-element copy: four adjacent float loads from %in
; stored to the same positions in %out. The checks expect one dwordx4 load
; followed by one dwordx4 store.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_4_adjacent_loads_f32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
|
|
%out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
|
|
%in.gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1
|
|
%in.gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2
|
|
%in.gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3
|
|
|
|
%x = load float, ptr addrspace(1) %in
|
|
%y = load float, ptr addrspace(1) %in.gep.1
|
|
%z = load float, ptr addrspace(1) %in.gep.2
|
|
%w = load float, ptr addrspace(1) %in.gep.3
|
|
|
|
store float %x, ptr addrspace(1) %out
|
|
store float %y, ptr addrspace(1) %out.gep.1
|
|
store float %z, ptr addrspace(1) %out.gep.2
|
|
store float %w, ptr addrspace(1) %out.gep.3
|
|
ret void
|
|
}
|
|
|
|
; Copies four adjacent i32 values from elements 11..14 of %in to elements
; 7..10 of %out. The checks expect one dwordx4 load at offset:44 and one
; dwordx4 store at offset:28.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_4_adjacent_loads_i32_nonzero_base:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s4, s2
|
|
; GCN-NEXT: s_mov_b32 s5, s3
|
|
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:44
|
|
; GCN-NEXT: s_mov_b32 s2, s6
|
|
; GCN-NEXT: s_mov_b32 s3, s7
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:28
|
|
; GCN-NEXT: s_endpgm
|
|
%in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11
|
|
%in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12
|
|
%in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13
|
|
%in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 14
|
|
%out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 7
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 8
|
|
%out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 9
|
|
%out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 10
|
|
|
|
%x = load i32, ptr addrspace(1) %in.gep.0
|
|
%y = load i32, ptr addrspace(1) %in.gep.1
|
|
%z = load i32, ptr addrspace(1) %in.gep.2
|
|
%w = load i32, ptr addrspace(1) %in.gep.3
|
|
|
|
store i32 %x, ptr addrspace(1) %out.gep.0
|
|
store i32 %y, ptr addrspace(1) %out.gep.1
|
|
store i32 %z, ptr addrspace(1) %out.gep.2
|
|
store i32 %w, ptr addrspace(1) %out.gep.3
|
|
ret void
|
|
}
|
|
|
|
; Loads four adjacent i32 values, executes a barrier, then stores each value
; to its matching position in descending address order (element 3 first).
; The checks expect one dwordx4 load, s_barrier, and one dwordx4 store.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_4_adjacent_loads_inverse_i32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: s_barrier
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
|
|
%out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
|
|
%in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
|
|
%in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
|
|
%in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
|
|
|
|
%x = load i32, ptr addrspace(1) %in
|
|
%y = load i32, ptr addrspace(1) %in.gep.1
|
|
%z = load i32, ptr addrspace(1) %in.gep.2
|
|
%w = load i32, ptr addrspace(1) %in.gep.3
|
|
|
|
; Make sure the barrier doesn't stop the stores below from being merged
|
|
tail call void @llvm.amdgcn.s.barrier() #1
|
|
|
|
store i32 %w, ptr addrspace(1) %out.gep.3
|
|
store i32 %z, ptr addrspace(1) %out.gep.2
|
|
store i32 %y, ptr addrspace(1) %out.gep.1
|
|
store i32 %x, ptr addrspace(1) %out
|
|
|
|
ret void
|
|
}
|
|
|
|
; TODO: Re-packing of loaded register required. Maybe an IR pass
|
|
; should catch this?
|
|
; Loads four adjacent i32 values, executes a barrier, then stores them in
; reversed element order (%w to element 0 ... %x to element 3). Both RUN lines
; expect one dwordx4 load and one dwordx4 store of v[3:6]; they differ in the
; register copies used to build v4..v6 (four moves on tahiti, three on bonaire).
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; SI-LABEL: merge_global_store_4_adjacent_loads_shuffle_i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; SI-NEXT: s_mov_b32 s6, -1
|
|
; SI-NEXT: s_mov_b32 s10, s6
|
|
; SI-NEXT: s_mov_b32 s11, s7
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s8, s2
|
|
; SI-NEXT: s_mov_b32 s9, s3
|
|
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; SI-NEXT: s_mov_b32 s4, s0
|
|
; SI-NEXT: s_mov_b32 s5, s1
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: s_barrier
|
|
; SI-NEXT: v_mov_b32_e32 v4, v2
|
|
; SI-NEXT: v_mov_b32_e32 v2, v0
|
|
; SI-NEXT: v_mov_b32_e32 v6, v2
|
|
; SI-NEXT: v_mov_b32_e32 v5, v1
|
|
; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_global_store_4_adjacent_loads_shuffle_i32:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; CI-NEXT: s_mov_b32 s7, 0xf000
|
|
; CI-NEXT: s_mov_b32 s6, -1
|
|
; CI-NEXT: s_mov_b32 s10, s6
|
|
; CI-NEXT: s_mov_b32 s11, s7
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: s_mov_b32 s8, s2
|
|
; CI-NEXT: s_mov_b32 s9, s3
|
|
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; CI-NEXT: s_mov_b32 s4, s0
|
|
; CI-NEXT: s_mov_b32 s5, s1
|
|
; CI-NEXT: s_waitcnt vmcnt(0)
|
|
; CI-NEXT: s_barrier
|
|
; CI-NEXT: v_mov_b32_e32 v4, v2
|
|
; CI-NEXT: v_mov_b32_e32 v5, v1
|
|
; CI-NEXT: v_mov_b32_e32 v6, v0
|
|
; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
|
|
; CI-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
|
|
%out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
|
|
%in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
|
|
%in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
|
|
%in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
|
|
|
|
%x = load i32, ptr addrspace(1) %in
|
|
%y = load i32, ptr addrspace(1) %in.gep.1
|
|
%z = load i32, ptr addrspace(1) %in.gep.2
|
|
%w = load i32, ptr addrspace(1) %in.gep.3
|
|
|
|
; Make sure the barrier doesn't stop the stores below from being merged
|
|
tail call void @llvm.amdgcn.s.barrier() #1
|
|
|
|
store i32 %w, ptr addrspace(1) %out
|
|
store i32 %z, ptr addrspace(1) %out.gep.1
|
|
store i32 %y, ptr addrspace(1) %out.gep.2
|
|
store i32 %x, ptr addrspace(1) %out.gep.3
|
|
|
|
ret void
|
|
}
|
|
|
|
; Copies four adjacent i8 values; the first load and the first store are
; marked align 4. The checks expect the whole copy to become one dword load
; and one dword store.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_4_adjacent_loads_i8:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
|
|
%out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
|
|
%out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
|
|
%in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
|
|
%in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
|
|
%in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3
|
|
|
|
%x = load i8, ptr addrspace(1) %in, align 4
|
|
%y = load i8, ptr addrspace(1) %in.gep.1
|
|
%z = load i8, ptr addrspace(1) %in.gep.2
|
|
%w = load i8, ptr addrspace(1) %in.gep.3
|
|
|
|
store i8 %x, ptr addrspace(1) %out, align 4
|
|
store i8 %y, ptr addrspace(1) %out.gep.1
|
|
store i8 %z, ptr addrspace(1) %out.gep.2
|
|
store i8 %w, ptr addrspace(1) %out.gep.3
|
|
ret void
|
|
}
|
|
|
|
; Copies four adjacent i8 values with no explicit alignment on any access.
; The checks expect nothing to be merged: four byte loads followed by four
; byte stores, each store preceded by its own s_waitcnt vmcnt(3).
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_4_adjacent_loads_i8_natural_align:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1
|
|
; GCN-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
|
|
; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
|
; GCN-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:1
|
|
; GCN-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:2
|
|
; GCN-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:3
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
|
|
%out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
|
|
%out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
|
|
%in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
|
|
%in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
|
|
%in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3
|
|
|
|
%x = load i8, ptr addrspace(1) %in
|
|
%y = load i8, ptr addrspace(1) %in.gep.1
|
|
%z = load i8, ptr addrspace(1) %in.gep.2
|
|
%w = load i8, ptr addrspace(1) %in.gep.3
|
|
|
|
store i8 %x, ptr addrspace(1) %out
|
|
store i8 %y, ptr addrspace(1) %out.gep.1
|
|
store i8 %z, ptr addrspace(1) %out.gep.2
|
|
store i8 %w, ptr addrspace(1) %out.gep.3
|
|
ret void
|
|
}
|
|
|
|
; Loads one <4 x i32> from %in, extracts all four lanes in order and stores
; them to four consecutive i32 slots of %out. The expected output is a single
; buffer_load_dwordx4 paired with a single buffer_store_dwordx4.
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-LABEL: merge_global_store_4_vector_elts_loads_v4i32:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
|
|
%out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
|
|
%out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
|
|
%vec = load <4 x i32>, ptr addrspace(1) %in
|
|
|
|
%x = extractelement <4 x i32> %vec, i32 0
|
|
%y = extractelement <4 x i32> %vec, i32 1
|
|
%z = extractelement <4 x i32> %vec, i32 2
|
|
%w = extractelement <4 x i32> %vec, i32 3
|
|
|
|
store i32 %x, ptr addrspace(1) %out
|
|
store i32 %y, ptr addrspace(1) %out.gep.1
|
|
store i32 %z, ptr addrspace(1) %out.gep.2
|
|
store i32 %w, ptr addrspace(1) %out.gep.3
|
|
ret void
|
|
}
|
|
|
|
; Stores two i8 constants to adjacent addrspace(3) bytes, writing the higher
; address (%out + 1) first; only the store to %out carries "align 2". The
; expected output is one ds_write_b16 of the combined constant 0x7bc8.
define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 {
|
|
; GCN-LABEL: merge_local_store_2_constants_i8:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
|
|
; GCN-NEXT: v_mov_b32_e32 v0, 0x7bc8
|
|
; GCN-NEXT: s_mov_b32 m0, -1
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: v_mov_b32_e32 v1, s0
|
|
; GCN-NEXT: ds_write_b16 v1, v0
|
|
; GCN-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1
|
|
|
|
; 123 is 0x7b, the high byte of the expected 0x7bc8.
store i8 123, ptr addrspace(3) %out.gep.1
|
|
; 456 does not fit in i8; its low 8 bits are 0xc8 (200), which is the low byte
; of the expected 0x7bc8.
store i8 456, ptr addrspace(3) %out, align 2
|
|
ret void
|
|
}
|
|
|
|
; Stores two i32 constants to adjacent addrspace(3) dwords, writing the higher
; address (%out + 4 bytes) first. Both check prefixes expect a single
; ds_write2_b32 with offset1:1 that writes 0x1c8 (456) to element 0 and 0x7b
; (123) to element 1; the two targets differ only in which VGPR holds which
; constant.
define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 {
|
|
; SI-LABEL: merge_local_store_2_constants_i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
|
|
; SI-NEXT: v_mov_b32_e32 v0, 0x7b
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x1c8
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v2, s0
|
|
; SI-NEXT: ds_write2_b32 v2, v1, v0 offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_local_store_2_constants_i32:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dword s0, s[4:5], 0x9
|
|
; CI-NEXT: v_mov_b32_e32 v0, 0x1c8
|
|
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
; CI-NEXT: s_mov_b32 m0, -1
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: v_mov_b32_e32 v2, s0
|
|
; CI-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
|
|
; CI-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
|
|
|
|
store i32 123, ptr addrspace(3) %out.gep.1
|
|
store i32 456, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Stores four i32 constants to consecutive addrspace(3) dwords; the store to
; element 0 (%out) is issued last in the IR. Both check prefixes expect two
; ds_write2_b32 instructions: elements 2 and 3 (0x1c8 = 456, 0x14d = 333)
; first, then elements 0 and 1 (0x4d2 = 1234, 0x7b = 123).
define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 {
|
|
; SI-LABEL: merge_local_store_4_constants_i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x1c8
|
|
; SI-NEXT: v_mov_b32_e32 v2, 0x14d
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: v_mov_b32_e32 v0, 0x7b
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v3, s0
|
|
; SI-NEXT: ds_write2_b32 v3, v1, v2 offset0:2 offset1:3
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x4d2
|
|
; SI-NEXT: ds_write2_b32 v3, v1, v0 offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_local_store_4_constants_i32:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dword s0, s[4:5], 0x9
|
|
; CI-NEXT: v_mov_b32_e32 v0, 0x1c8
|
|
; CI-NEXT: v_mov_b32_e32 v1, 0x14d
|
|
; CI-NEXT: s_mov_b32 m0, -1
|
|
; CI-NEXT: v_mov_b32_e32 v2, 0x4d2
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: v_mov_b32_e32 v3, s0
|
|
; CI-NEXT: ds_write2_b32 v3, v0, v1 offset0:2 offset1:3
|
|
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
|
|
; CI-NEXT: ds_write2_b32 v3, v2, v0 offset1:1
|
|
; CI-NEXT: s_endpgm
|
|
%out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
|
|
%out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2
|
|
%out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3
|
|
|
|
store i32 123, ptr addrspace(3) %out.gep.1
|
|
store i32 456, ptr addrspace(3) %out.gep.2
|
|
store i32 333, ptr addrspace(3) %out.gep.3
|
|
store i32 1234, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Stores five i32 constants to consecutive 4-byte-aligned addrspace(1) slots in
; ascending address order. Both check prefixes expect one buffer_store_dwordx4
; for elements 0-3 followed by one buffer_store_dword at offset:16 for element
; 4. Before v0 is overwritten for the second store, the SI run expects
; "s_waitcnt expcnt(0)" while the CI run expects "s_nop 0".
define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) %out) {
|
|
; SI-LABEL: merge_global_store_5_constants_i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: v_mov_b32_e32 v0, 9
|
|
; SI-NEXT: v_mov_b32_e32 v1, 12
|
|
; SI-NEXT: v_mov_b32_e32 v2, 16
|
|
; SI-NEXT: v_mov_b32_e32 v3, -12
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; SI-NEXT: s_waitcnt expcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, 11
|
|
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_global_store_5_constants_i32:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; CI-NEXT: s_mov_b32 s3, 0xf000
|
|
; CI-NEXT: s_mov_b32 s2, -1
|
|
; CI-NEXT: v_mov_b32_e32 v0, 9
|
|
; CI-NEXT: v_mov_b32_e32 v1, 12
|
|
; CI-NEXT: v_mov_b32_e32 v2, 16
|
|
; CI-NEXT: v_mov_b32_e32 v3, -12
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; CI-NEXT: s_nop 0
|
|
; CI-NEXT: v_mov_b32_e32 v0, 11
|
|
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
|
|
; CI-NEXT: s_endpgm
|
|
store i32 9, ptr addrspace(1) %out, align 4
|
|
%idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
|
|
store i32 12, ptr addrspace(1) %idx1, align 4
|
|
%idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
|
|
store i32 16, ptr addrspace(1) %idx2, align 4
|
|
%idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
|
|
store i32 -12, ptr addrspace(1) %idx3, align 4
|
|
%idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
|
|
store i32 11, ptr addrspace(1) %idx4, align 4
|
|
ret void
|
|
}
|
|
|
|
; Stores six i32 constants to consecutive 4-byte-aligned addrspace(1) slots in
; ascending address order. Both check prefixes expect one buffer_store_dwordx4
; for elements 0-3 followed by one buffer_store_dwordx2 at offset:16 for
; elements 4-5 (11 and 0x7b = 123).
define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) {
|
|
; SI-LABEL: merge_global_store_6_constants_i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: v_mov_b32_e32 v0, 13
|
|
; SI-NEXT: v_mov_b32_e32 v1, 15
|
|
; SI-NEXT: v_mov_b32_e32 v2, 62
|
|
; SI-NEXT: v_mov_b32_e32 v3, 63
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; SI-NEXT: s_waitcnt expcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, 11
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_global_store_6_constants_i32:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; CI-NEXT: s_mov_b32 s3, 0xf000
|
|
; CI-NEXT: s_mov_b32 s2, -1
|
|
; CI-NEXT: v_mov_b32_e32 v0, 13
|
|
; CI-NEXT: v_mov_b32_e32 v1, 15
|
|
; CI-NEXT: v_mov_b32_e32 v2, 62
|
|
; CI-NEXT: v_mov_b32_e32 v3, 63
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; CI-NEXT: s_nop 0
|
|
; CI-NEXT: v_mov_b32_e32 v0, 11
|
|
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
|
|
; CI-NEXT: s_endpgm
|
|
store i32 13, ptr addrspace(1) %out, align 4
|
|
%idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
|
|
store i32 15, ptr addrspace(1) %idx1, align 4
|
|
%idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
|
|
store i32 62, ptr addrspace(1) %idx2, align 4
|
|
%idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
|
|
store i32 63, ptr addrspace(1) %idx3, align 4
|
|
%idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
|
|
store i32 11, ptr addrspace(1) %idx4, align 4
|
|
%idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
|
|
store i32 123, ptr addrspace(1) %idx5, align 4
|
|
ret void
|
|
}
|
|
|
|
; Stores seven i32 constants to consecutive 4-byte-aligned addrspace(1) slots
; in ascending address order. Both check prefixes expect a buffer_store_dwordx4
; for elements 0-3. For the remaining three elements the SI run expects a
; buffer_store_dword at offset:24 plus a buffer_store_dwordx2 at offset:16,
; whereas the CI run expects a single buffer_store_dwordx3 at offset:16.
define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) {
|
|
; SI-LABEL: merge_global_store_7_constants_i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: v_mov_b32_e32 v0, 34
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x3e7
|
|
; SI-NEXT: v_mov_b32_e32 v2, 0x41
|
|
; SI-NEXT: v_mov_b32_e32 v3, 33
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; SI-NEXT: s_waitcnt expcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, 0xd4
|
|
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24
|
|
; SI-NEXT: s_waitcnt expcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, 0x62
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x5b
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_global_store_7_constants_i32:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; CI-NEXT: s_mov_b32 s3, 0xf000
|
|
; CI-NEXT: s_mov_b32 s2, -1
|
|
; CI-NEXT: v_mov_b32_e32 v0, 34
|
|
; CI-NEXT: v_mov_b32_e32 v1, 0x3e7
|
|
; CI-NEXT: v_mov_b32_e32 v2, 0x41
|
|
; CI-NEXT: v_mov_b32_e32 v3, 33
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; CI-NEXT: s_nop 0
|
|
; CI-NEXT: v_mov_b32_e32 v0, 0x62
|
|
; CI-NEXT: v_mov_b32_e32 v1, 0x5b
|
|
; CI-NEXT: v_mov_b32_e32 v2, 0xd4
|
|
; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:16
|
|
; CI-NEXT: s_endpgm
|
|
store i32 34, ptr addrspace(1) %out, align 4
|
|
%idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
|
|
store i32 999, ptr addrspace(1) %idx1, align 4
|
|
%idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
|
|
store i32 65, ptr addrspace(1) %idx2, align 4
|
|
%idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
|
|
store i32 33, ptr addrspace(1) %idx3, align 4
|
|
%idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
|
|
store i32 98, ptr addrspace(1) %idx4, align 4
|
|
%idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
|
|
store i32 91, ptr addrspace(1) %idx5, align 4
|
|
%idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
|
|
store i32 212, ptr addrspace(1) %idx6, align 4
|
|
ret void
|
|
}
|
|
|
|
; Stores eight i32 constants to consecutive 4-byte-aligned addrspace(1) slots
; in ascending address order. Both check prefixes expect two
; buffer_store_dwordx4 instructions (offset 0 and offset:16). Elements 1 and 7
; are both 999 (0x3e7): the expected code materializes it once in v3, which is
; the second register of the first store (v[2:5]) and the last register of the
; second store (v[0:3]).
define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) {
|
|
; SI-LABEL: merge_global_store_8_constants_i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
; SI-NEXT: v_mov_b32_e32 v2, 34
|
|
; SI-NEXT: v_mov_b32_e32 v3, 0x3e7
|
|
; SI-NEXT: v_mov_b32_e32 v4, 0x41
|
|
; SI-NEXT: v_mov_b32_e32 v5, 33
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0
|
|
; SI-NEXT: v_mov_b32_e32 v0, 0x62
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x5b
|
|
; SI-NEXT: s_waitcnt expcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v2, 0xd4
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: merge_global_store_8_constants_i32:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; CI-NEXT: s_mov_b32 s3, 0xf000
|
|
; CI-NEXT: s_mov_b32 s2, -1
|
|
; CI-NEXT: v_mov_b32_e32 v2, 34
|
|
; CI-NEXT: v_mov_b32_e32 v3, 0x3e7
|
|
; CI-NEXT: v_mov_b32_e32 v4, 0x41
|
|
; CI-NEXT: v_mov_b32_e32 v5, 33
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0
|
|
; CI-NEXT: v_mov_b32_e32 v0, 0x62
|
|
; CI-NEXT: v_mov_b32_e32 v1, 0x5b
|
|
; CI-NEXT: v_mov_b32_e32 v2, 0xd4
|
|
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; CI-NEXT: s_endpgm
|
|
store i32 34, ptr addrspace(1) %out, align 4
|
|
%idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
|
|
store i32 999, ptr addrspace(1) %idx1, align 4
|
|
%idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
|
|
store i32 65, ptr addrspace(1) %idx2, align 4
|
|
%idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
|
|
store i32 33, ptr addrspace(1) %idx3, align 4
|
|
%idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
|
|
store i32 98, ptr addrspace(1) %idx4, align 4
|
|
%idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
|
|
store i32 91, ptr addrspace(1) %idx5, align 4
|
|
%idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
|
|
store i32 212, ptr addrspace(1) %idx6, align 4
|
|
%idx7 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 7
|
|
store i32 999, ptr addrspace(1) %idx7, align 4
|
|
ret void
|
|
}
|
|
|
|
; This requires handling of scalar_to_vector for v2i64 to avoid
|
|
; scratch usage.
|
|
; FIXME: Should do single load and store
|
|
; Copies one <3 x i32> from %in to %out (both noalias); the load is marked
; align 4 and the store has no explicit alignment. The SI run expects the copy
; split into a dword access at offset:8 and a dwordx2 access at offset 0; the
; CI run expects a single buffer_load_dwordx3 / buffer_store_dwordx3 pair.
define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
|
|
; SI-LABEL: copy_v3i32_align4:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; SI-NEXT: s_mov_b32 s6, -1
|
|
; SI-NEXT: s_mov_b32 s10, s6
|
|
; SI-NEXT: s_mov_b32 s11, s7
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s8, s2
|
|
; SI-NEXT: s_mov_b32 s9, s3
|
|
; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8
|
|
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; SI-NEXT: s_mov_b32 s4, s0
|
|
; SI-NEXT: s_mov_b32 s5, s1
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: copy_v3i32_align4:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; CI-NEXT: s_mov_b32 s7, 0xf000
|
|
; CI-NEXT: s_mov_b32 s6, -1
|
|
; CI-NEXT: s_mov_b32 s10, s6
|
|
; CI-NEXT: s_mov_b32 s11, s7
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: s_mov_b32 s8, s2
|
|
; CI-NEXT: s_mov_b32 s9, s3
|
|
; CI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
|
|
; CI-NEXT: s_mov_b32 s4, s0
|
|
; CI-NEXT: s_mov_b32 s5, s1
|
|
; CI-NEXT: s_waitcnt vmcnt(0)
|
|
; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
|
|
; CI-NEXT: s_endpgm
|
|
%vec = load <3 x i32>, ptr addrspace(1) %in, align 4
|
|
store <3 x i32> %vec, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
; GCN: ScratchSize: 0{{$}}
|
|
|
|
; Copies one <3 x i64> from %in to %out (both noalias); the load is marked
; align 4 and the store has no explicit alignment. The shared check prefix
; expects the copy split into a dwordx2 access at offset:16 and a dwordx4
; access at offset 0, for both the loads and the stores.
define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
|
|
; GCN-LABEL: copy_v3i64_align4:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
|
|
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
|
|
; GCN-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%vec = load <3 x i64>, ptr addrspace(1) %in, align 4
|
|
store <3 x i64> %vec, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
; GCN: ScratchSize: 0{{$}}
|
|
|
|
; Loads one <3 x float> from %in (align 4), adds <1.0, 2.0, 4.0> lane-wise and
; stores the result to %out. The SI run expects the memory accesses split into
; dwordx2 (offset 0) and dword (offset:8) pieces; the CI run expects a single
; buffer_load_dwordx3 / buffer_store_dwordx3 pair. Both expect three
; v_add_f32 instructions, one per lane.
define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
|
|
; SI-LABEL: copy_v3f32_align4:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; SI-NEXT: s_mov_b32 s6, -1
|
|
; SI-NEXT: s_mov_b32 s10, s6
|
|
; SI-NEXT: s_mov_b32 s11, s7
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: s_mov_b32 s8, s2
|
|
; SI-NEXT: s_mov_b32 s9, s3
|
|
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8
|
|
; SI-NEXT: s_mov_b32 s4, s0
|
|
; SI-NEXT: s_mov_b32 s5, s1
|
|
; SI-NEXT: s_waitcnt vmcnt(1)
|
|
; SI-NEXT: v_add_f32_e32 v1, 2.0, v1
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: v_add_f32_e32 v2, 4.0, v2
|
|
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
|
|
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; CI-LABEL: copy_v3f32_align4:
|
|
; CI: ; %bb.0:
|
|
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; CI-NEXT: s_mov_b32 s7, 0xf000
|
|
; CI-NEXT: s_mov_b32 s6, -1
|
|
; CI-NEXT: s_mov_b32 s10, s6
|
|
; CI-NEXT: s_mov_b32 s11, s7
|
|
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CI-NEXT: s_mov_b32 s8, s2
|
|
; CI-NEXT: s_mov_b32 s9, s3
|
|
; CI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
|
|
; CI-NEXT: s_mov_b32 s4, s0
|
|
; CI-NEXT: s_mov_b32 s5, s1
|
|
; CI-NEXT: s_waitcnt vmcnt(0)
|
|
; CI-NEXT: v_add_f32_e32 v2, 4.0, v2
|
|
; CI-NEXT: v_add_f32_e32 v1, 2.0, v1
|
|
; CI-NEXT: v_add_f32_e32 v0, 1.0, v0
|
|
; CI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
|
|
; CI-NEXT: s_endpgm
|
|
%vec = load <3 x float>, ptr addrspace(1) %in, align 4
|
|
%fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
|
|
store <3 x float> %fadd, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
; GCN: ScratchSize: 0{{$}}
|
|
|
|
; Loads one <3 x double> from %in (align 4), adds <1.0, 2.0, 4.0> lane-wise and
; stores the result to %out. The shared check prefix expects the memory
; accesses split into a dwordx4 piece at offset 0 and a dwordx2 piece at
; offset:16, with three v_add_f64 instructions in between.
define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
|
|
; GCN-LABEL: copy_v3f64_align4:
|
|
; GCN: ; %bb.0:
|
|
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 offset:16
|
|
; GCN-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 2.0
|
|
; GCN-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 4.0
|
|
; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
|
|
; GCN-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
|
|
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NEXT: s_endpgm
|
|
%vec = load <3 x double>, ptr addrspace(1) %in, align 4
|
|
%fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
|
|
store <3 x double> %fadd, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
; GCN: ScratchSize: 0{{$}}
|
|
|
|
; Declaration of the s.barrier intrinsic, tagged with attribute group #1.
; NOTE(review): no function in this part of the file calls it; presumably it
; is used by a test earlier in the file -- confirm before removing.
declare void @llvm.amdgcn.s.barrier() #1
|
|
|
|
; Attribute group referenced as "#0" by kernel definitions in this file.
attributes #0 = { nounwind }
|
|
; Attribute group referenced as "#1" by the barrier declaration.
attributes #1 = { convergent nounwind }
|