llvm-project/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll

; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Each bitcast should be pushed through to its constant vector operand so the
; vectors can be broken down and the components shared between them can be CSEd.

; Two volatile stores to %out of <8 x i32> constants bitcast to <8 x float>.
; The two constants are identical except for the last element (8 vs. 9).
; The checks below expect four buffer_store_dwordx4 in total and forbid any
; v_mov_b32 between the second and the fourth of them (presumably because the
; registers already holding the shared elements are reused).
; The %vec kernel argument is not referenced in the body.
; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(ptr addrspace(1) %out, <8 x i32> %vec) {
  ; First constant: seven 7s followed by 8.
  %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
  store volatile <8 x float> %vec0.bc, ptr addrspace(1) %out

  ; Second constant: same as the first except the last element is 9.
  %vec1.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float>
  store volatile <8 x float> %vec1.bc, ptr addrspace(1) %out
  ret void
}

; Two volatile stores to %out of <4 x i64> constants bitcast to <8 x float>,
; so the element width changes across the cast (64-bit source elements,
; 32-bit destination elements). The constants differ only in the last i64
; element (8 vs. 9).
; The checks below expect four buffer_store_dwordx4 in total and forbid any
; v_mov_b32 between the second and the fourth of them.
; The %vec kernel argument is not referenced in the body.
; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(ptr addrspace(1) %out, <4 x i64> %vec) {
  ; First constant: three 7s followed by 8.
  %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
  store volatile <8 x float> %vec0.bc, ptr addrspace(1) %out

  ; Second constant: same as the first except the last element is 9.
  %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float>
  store volatile <8 x float> %vec1.bc, ptr addrspace(1) %out
  ret void
}

; Two volatile stores to %out of <4 x i64> constants bitcast to <4 x double>,
; i.e. an integer-to-floating-point cast with the same element width and
; element count. The constants differ only in the last element (8 vs. 9).
; The checks below expect four buffer_store_dwordx4 in total and forbid any
; v_mov_b32 between the second and the fourth of them.
; The %vec kernel argument is not referenced in the body.
; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %vec) {
  ; First constant: three 7s followed by 8.
  %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
  store volatile <4 x double> %vec0.bc, ptr addrspace(1) %out

  ; Second constant: same as the first except the last element is 9.
  %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double>
  store volatile <4 x double> %vec1.bc, ptr addrspace(1) %out
  ret void
}

; Two volatile stores to %out of <16 x i16> constants bitcast to <8 x float>,
; so pairs of 16-bit source elements are combined into each 32-bit destination
; element. The constants differ only in the last i16 element (8 vs. 9).
; The checks below expect four buffer_store_dwordx4 in total and forbid any
; v_mov_b32 between the second and the fourth of them.
; The %vec kernel argument is not referenced in the body.
; NOTE(review): the function name says v8i32 -> v16i16, but the body casts
; <16 x i16> to <8 x float>; the name looks stale -- confirm before renaming,
; since the label check matches on it.
; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(ptr addrspace(1) %out, <16 x i16> %vec) {
  ; First constant: fifteen 7s followed by 8.
  %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
  store volatile <8 x float> %vec0.bc, ptr addrspace(1) %out

  ; Second constant: same as the first except the last element is 9.
  %vec1.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float>
  store volatile <8 x float> %vec1.bc, ptr addrspace(1) %out
  ret void
}

; Stores the <2 x i32> bitcast of the result of llvm.amdgcn.icmp.i64 called
; with 999 as its i32 operand. Going by the function and value names, the call
; is expected to be lowered to undef (presumably because 999 is not a valid
; condition code -- TODO confirm against the intrinsic's lowering).
; The check below requires that nothing matching "store_dword" is emitted
; after this function's label, i.e. the store of the undef value is dropped.
; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
; GCN-NOT: store_dword
define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
  ; Intrinsic call whose result the test expects to become undef.
  %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 999) #1
  ; The bitcast sits between the undef source and the (non-volatile) store.
  %bc = bitcast i64 %undef to <2 x i32>
  store <2 x i32> %bc, ptr addrspace(1) %out
  ret void
}

; Variant of the undef-source test that stores only one element: the result of
; llvm.amdgcn.icmp.i64 (called with 9999 as its i32 operand) is bitcast to
; <2 x i32>, element 1 is extracted, and that i32 is stored to %out.
; Going by the function and value names, the call is expected to be lowered to
; undef (presumably because 9999 is not a valid condition code -- TODO confirm).
; The check below requires that nothing matching "store_dword" is emitted
; after this function's label.
; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
; GCN-NOT: store_dword
define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
  ; Intrinsic call whose result the test expects to become undef.
  %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 9999) #1
  %bc = bitcast i64 %undef to <2 x i32>
  ; Extract the second 32-bit element of the bitcast value and store just that.
  %elt1 = extractelement <2 x i32> %bc, i32 1
  store i32 %elt1, ptr addrspace(1) %out
  ret void
}

; Intrinsic called by the two store_value_lowered_to_undef_* kernels: takes two
; i64 operands plus an i32 operand and returns an i64.
declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #1

; #0 is attached to the two store_value_lowered_to_undef_* kernel definitions.
attributes #0 = { nounwind }
; #1 is attached to the intrinsic declaration and to its call sites.
attributes #1 = { nounwind readnone convergent }