
; Commit note: The hasOneUse check was failing in any case where the load was
; part of a chain - we should only be checking whether the loaded *value* has
; one use; any updates to the chain should be handled by the fold calling
; shouldReduceLoadWidth. The x86 implementation was updated to match, although
; it has no effect here yet (the inner for loop was discarding chain uses
; anyway). Using SDValue::hasOneUse exposed a missing dependency on the
; LLVMSelectionDAG library in many tools + unittests, which required making
; SDNode::hasNUsesOfValue inline. Noticed while fighting the x86 regressions
; in #122671.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cyclone -mattr=+slow-misaligned-128store | FileCheck %s --check-prefixes=CHECK,SPLITTING
; RUN: llc < %s -mtriple=aarch64 -mattr=-slow-misaligned-128store | FileCheck %s --check-prefixes=CHECK,MISALIGNED

; External <3 x float> globals used by @blam below; note the differing
; alignments (16 for @g0, 4 for @g1), which matter to the store-merging folds
; exercised by this test.
@g0 = external dso_local global <3 x float>, align 16
@g1 = external dso_local global <3 x float>, align 4
; Two loads of @g0 feed extracts of lanes 0 and 1, stored to consecutive
; floats at @g1. Both run lines expect the pair to collapse into a single
; d-register (64-bit) load/store; the prefixes differ only in scheduling of
; the address materialization.
define void @blam() {
; SPLITTING-LABEL: blam:
; SPLITTING:       // %bb.0:
; SPLITTING-NEXT:    adrp x8, g1
; SPLITTING-NEXT:    add x8, x8, :lo12:g1
; SPLITTING-NEXT:    adrp x9, g0
; SPLITTING-NEXT:    ldr d0, [x9, :lo12:g0]
; SPLITTING-NEXT:    str d0, [x8]
; SPLITTING-NEXT:    ret
;
; MISALIGNED-LABEL: blam:
; MISALIGNED:       // %bb.0:
; MISALIGNED-NEXT:    adrp x8, g0
; MISALIGNED-NEXT:    ldr d0, [x8, :lo12:g0]
; MISALIGNED-NEXT:    adrp x8, g1
; MISALIGNED-NEXT:    add x8, x8, :lo12:g1
; MISALIGNED-NEXT:    str d0, [x8]
; MISALIGNED-NEXT:    ret
  %tmp4 = getelementptr inbounds <3 x float>, ptr @g1, i64 0, i64 0
  %tmp5 = load <3 x float>, ptr @g0, align 16
  %tmp6 = extractelement <3 x float> %tmp5, i64 0
  store float %tmp6, ptr %tmp4
  %tmp7 = getelementptr inbounds float, ptr %tmp4, i64 1
  %tmp8 = load <3 x float>, ptr @g0, align 16
  %tmp9 = extractelement <3 x float> %tmp8, i64 1
  store float %tmp9, ptr %tmp7
  ret void
}

; PR21711 - Merge vector stores into wider vector stores.

; On Cyclone, the stores should not get merged into a 16-byte store because
; unaligned 16-byte stores are slow. This test would infinite loop when
; the fastness of unaligned accesses was not specified correctly.

; Two <2 x float> halves of %v1 stored to adjacent 8-byte slots. With
; slow-misaligned-128store (SPLITTING) the checks expect a d-register pair
; store (stp); without it (MISALIGNED) the checks expect a single unaligned
; 16-byte store (stur q0).
define void @merge_vec_extract_stores(<4 x float> %v1, ptr %ptr) {
; SPLITTING-LABEL: merge_vec_extract_stores:
; SPLITTING:       // %bb.0:
; SPLITTING-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; SPLITTING-NEXT:    stp d0, d1, [x0, #24]
; SPLITTING-NEXT:    ret
;
; MISALIGNED-LABEL: merge_vec_extract_stores:
; MISALIGNED:       // %bb.0:
; MISALIGNED-NEXT:    stur q0, [x0, #24]
; MISALIGNED-NEXT:    ret
  %idx0 = getelementptr inbounds <2 x float>, ptr %ptr, i64 3
  %idx1 = getelementptr inbounds <2 x float>, ptr %ptr, i64 4

  %shuffle0 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %shuffle1 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 2, i32 3>

  store <2 x float> %shuffle0, ptr %idx0, align 8
  store <2 x float> %shuffle1, ptr %idx1, align 8
  ret void
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}