[AArch64] Use correct regclass for spills of ZPR2/ZPR4 (#148806)

Commit a6293228fdd5aba8c04c63f02f3d017443feb3f2 forced the register
class ZPR[24]StridedOrContiguous for spills/fills of ZPR2 and ZPR4.
However, this can cause issues when the regclass of the fill is a plain
ZPR2/ZPR4: the register allocator may then pick `z1_z2`, which is not a
supported register for ZPR2StridedOrContiguous. That class only supports
tuples of the form (strided) `z0_z8`, `z1_z9` or (contiguous, starting
at a multiple of 2) `z0_z1`, `z2_z3`. For spills we could add a new
register class that supports any of the tuple forms, but for consistency
with the fills I've decided to use two separate pseudos instead.

Fixes https://github.com/llvm/llvm-project/issues/148655
This commit is contained in:
Sander de Smalen 2025-07-15 14:53:05 +01:00 committed by GitHub
parent bda56023c9
commit 50e345ef95
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 73 additions and 38 deletions

View File

@ -1591,18 +1591,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
"Non-writeback variants of STGloop / STZGloop should not "
"survive past PrologEpilogInserter.");
case AArch64::STR_ZZZZXI:
case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
case AArch64::STR_ZZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
case AArch64::STR_ZZXI:
case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
case AArch64::STR_PPXI:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2);
case AArch64::LDR_ZZZZXI:
case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
case AArch64::LDR_ZZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
case AArch64::LDR_ZZXI:
case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
case AArch64::LDR_PPXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2);

View File

@ -2482,8 +2482,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::LDR_PXI:
case AArch64::LDR_ZXI:
case AArch64::LDR_ZZXI:
case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDR_ZZZXI:
case AArch64::LDR_ZZZZXI:
case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDRBBui:
case AArch64::LDRBui:
case AArch64::LDRDui:
@ -2525,8 +2527,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::STR_PXI:
case AArch64::STR_ZXI:
case AArch64::STR_ZZXI:
case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
case AArch64::STR_ZZZXI:
case AArch64::STR_ZZZZXI:
case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
case AArch64::STRBBui:
case AArch64::STRBui:
case AArch64::STRDui:
@ -4318,7 +4322,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
break;
// SVE
case AArch64::STR_ZZZZXI:
case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDR_ZZZZXI:
case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
Scale = TypeSize::getScalable(16);
Width = TypeSize::getScalable(16 * 4);
MinOffset = -256;
@ -4332,7 +4338,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
MaxOffset = 253;
break;
case AArch64::STR_ZZXI:
case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDR_ZZXI:
case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
Scale = TypeSize::getScalable(16);
Width = TypeSize::getScalable(16 * 2);
MinOffset = -256;
@ -5559,8 +5567,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov2d;
Offset = false;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
} else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
StackID = TargetStackID::ScalableVector;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZXI;
@ -5584,8 +5596,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv2d;
Offset = false;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
} else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
StackID = TargetStackID::ScalableVector;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZZZXI;
@ -5736,8 +5752,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov2d;
Offset = false;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
} else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
StackID = TargetStackID::ScalableVector;
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZXI;
@ -5761,8 +5781,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv2d;
Offset = false;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
} else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
StackID = TargetStackID::ScalableVector;
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZZZXI;

View File

@ -2625,16 +2625,22 @@ let Predicates = [HasSVE_or_SME] in {
// These get expanded to individual LDR_ZXI/STR_ZXI instructions in
// AArch64ExpandPseudoInsts.
let mayLoad = 1, hasSideEffects = 0 in {
def LDR_ZZXI : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
let mayStore = 1, hasSideEffects = 0 in {
def STR_ZZXI : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
let AddedComplexity = 1 in {

View File

@ -1,5 +1,5 @@
# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy %s -o - | FileCheck %s
# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND
# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s
# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s --check-prefix=EXPAND
--- |
; ModuleID = '<stdin>'
source_filename = "<stdin>"
@ -14,13 +14,14 @@
define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_ppr_to_pnr() #1 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #2 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #0 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #2 { entry: unreachable }
attributes #0 = { nounwind "target-features"="+sve" }
attributes #1 = { nounwind "target-features"="+sve2p1" }
attributes #2 = { nounwind "target-features"="+sve,+sme2" "aarch64_pstate_sm_enabled" }
...
---
@ -318,10 +319,10 @@ registers:
- { id: 0, class: zpr2 }
stack:
liveins:
- { reg: '$z0_z1', virtual-reg: '%0' }
- { reg: '$z1_z2', virtual-reg: '%0' }
body: |
bb.0.entry:
liveins: $z0_z1
liveins: $z1_z2
; CHECK-LABEL: name: spills_fills_stack_id_zpr2
; CHECK: stack:
@ -329,12 +330,12 @@ body: |
; CHECK-NEXT: stack-id: scalable-vector
; EXPAND-LABEL: name: spills_fills_stack_id_zpr2
; EXPAND: STR_ZXI $z0, $sp, 0
; EXPAND: STR_ZXI $z1, $sp, 1
; EXPAND: $z0 = LDR_ZXI $sp, 0
; EXPAND: $z1 = LDR_ZXI $sp, 1
; EXPAND: STR_ZXI $z1, $sp, 0
; EXPAND: STR_ZXI $z2, $sp, 1
; EXPAND: $z1 = LDR_ZXI $sp, 0
; EXPAND: $z2 = LDR_ZXI $sp, 1
%0:zpr2 = COPY $z0_z1
%0:zpr2 = COPY $z1_z2
$z0_z1_z2_z3 = IMPLICIT_DEF
$z4_z5_z6_z7 = IMPLICIT_DEF
@ -345,7 +346,7 @@ body: |
$z24_z25_z26_z27 = IMPLICIT_DEF
$z28_z29_z30_z31 = IMPLICIT_DEF
$z0_z1 = COPY %0
$z1_z2 = COPY %0
RET_ReallyLR
...
---
@ -439,10 +440,10 @@ registers:
- { id: 0, class: zpr4 }
stack:
liveins:
- { reg: '$z0_z1_z2_z3', virtual-reg: '%0' }
- { reg: '$z1_z2_z3_z4', virtual-reg: '%0' }
body: |
bb.0.entry:
liveins: $z0_z1_z2_z3
liveins: $z1_z2_z3_z4
; CHECK-LABEL: name: spills_fills_stack_id_zpr4
; CHECK: stack:
@ -450,16 +451,16 @@ body: |
; CHECK-NEXT: stack-id: scalable-vector
; EXPAND-LABEL: name: spills_fills_stack_id_zpr4
; EXPAND: STR_ZXI $z0, $sp, 0
; EXPAND: STR_ZXI $z1, $sp, 1
; EXPAND: STR_ZXI $z2, $sp, 2
; EXPAND: STR_ZXI $z3, $sp, 3
; EXPAND: $z0 = LDR_ZXI $sp, 0
; EXPAND: $z1 = LDR_ZXI $sp, 1
; EXPAND: $z2 = LDR_ZXI $sp, 2
; EXPAND: $z3 = LDR_ZXI $sp, 3
; EXPAND: STR_ZXI $z1, $sp, 0
; EXPAND: STR_ZXI $z2, $sp, 1
; EXPAND: STR_ZXI $z3, $sp, 2
; EXPAND: STR_ZXI $z4, $sp, 3
; EXPAND: $z1 = LDR_ZXI $sp, 0
; EXPAND: $z2 = LDR_ZXI $sp, 1
; EXPAND: $z3 = LDR_ZXI $sp, 2
; EXPAND: $z4 = LDR_ZXI $sp, 3
%0:zpr4 = COPY $z0_z1_z2_z3
%0:zpr4 = COPY $z1_z2_z3_z4
$z0_z1_z2_z3 = IMPLICIT_DEF
$z4_z5_z6_z7 = IMPLICIT_DEF
@ -470,7 +471,7 @@ body: |
$z24_z25_z26_z27 = IMPLICIT_DEF
$z28_z29_z30_z31 = IMPLICIT_DEF
$z0_z1_z2_z3 = COPY %0
$z1_z2_z3_z4 = COPY %0
RET_ReallyLR
...
---