; llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
; [AArch64][ISel] Select constructive EXT_ZZI pseudo instruction (#152554)
;
; The patch adds patterns to select the EXT_ZZI_CONSTRUCTIVE pseudo
; instead of the destructive EXT_ZZI instruction for vector_splice. This
; only works when the two inputs to vector_splice are identical.
;
; Because the registers are no longer tied, the register allocator has
; more freedom, and many MOVs get replaced with MOVPRFX.
;
; In some cases, however, we could have chosen the same input and output
; register, but regalloc preferred not to. As a result, some test cases
; now have more instructions: a MOVPRFX where previously no MOV was
; needed.
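;
; For reference (an explanatory note, not part of the commit message):
; MOVPRFX copies a source vector into the destination register and is
; intended to fuse with the destructive SVE instruction that follows,
; giving that instruction a constructive three-operand form, e.g.:
;   movprfx z2, z0
;   sdiv    z2.s, p0/m, z2.s, z1.s   // z2 = z0 / z1, z0 preserved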

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
;
; SREM
;
; Vector vXi8 sdiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
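; As a sketch of the lowering used throughout this file: the remainder is
; computed via the identity srem(a, b) = a - (a / b) * b. NEON has no
; vector integer divide, so elements are sign-extended to i32
; (sshll/sunpklo), divided with the SVE sdiv/sdivr, narrowed back, and
; folded into the result with a multiply-subtract (mls/msb). For example,
; srem(7, 3) = 7 - (7 / 3) * 3 = 7 - 6 = 1.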
define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v8i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll v2.8h, v1.8b, #0
; VBITS_GE_128-NEXT: sshll v3.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h
; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v8i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT: umov w8, v2.h[0]
; VBITS_GE_256-NEXT: umov w9, v2.h[1]
; VBITS_GE_256-NEXT: fmov s3, w8
; VBITS_GE_256-NEXT: umov w8, v2.h[2]
; VBITS_GE_256-NEXT: mov v3.b[1], w9
; VBITS_GE_256-NEXT: mov v3.b[2], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[3]
; VBITS_GE_256-NEXT: mov v3.b[3], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[4]
; VBITS_GE_256-NEXT: mov v3.b[4], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[5]
; VBITS_GE_256-NEXT: mov v3.b[5], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[6]
; VBITS_GE_256-NEXT: mov v3.b[6], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[7]
; VBITS_GE_256-NEXT: mov v3.b[7], w8
; VBITS_GE_256-NEXT: mls v0.8b, v3.8b, v1.8b
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: srem_v8i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT: sunpklo z2.h, z1.b
; VBITS_GE_512-NEXT: sunpklo z3.h, z0.b
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_512-NEXT: sunpklo z3.s, z3.h
; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT: umov w8, v2.h[0]
; VBITS_GE_512-NEXT: umov w9, v2.h[1]
; VBITS_GE_512-NEXT: fmov s3, w8
; VBITS_GE_512-NEXT: umov w8, v2.h[2]
; VBITS_GE_512-NEXT: mov v3.b[1], w9
; VBITS_GE_512-NEXT: mov v3.b[2], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[3]
; VBITS_GE_512-NEXT: mov v3.b[3], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[4]
; VBITS_GE_512-NEXT: mov v3.b[4], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[5]
; VBITS_GE_512-NEXT: mov v3.b[5], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[6]
; VBITS_GE_512-NEXT: mov v3.b[6], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[7]
; VBITS_GE_512-NEXT: mov v3.b[7], w8
; VBITS_GE_512-NEXT: mls v0.8b, v3.8b, v1.8b
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = srem <8 x i8> %op1, %op2
ret <8 x i8> %res
}
define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v16i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: sshll v5.8h, v0.8b, #0
; VBITS_GE_128-NEXT: sshll2 v7.4s, v5.8h, #0
; VBITS_GE_128-NEXT: sshll v5.4s, v5.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sshll v3.8h, v1.8b, #0
; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT: uzp1 v3.8h, v3.8h, v6.8h
; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b
; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v16i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h
; VBITS_GE_256-NEXT: sunpklo z5.s, z3.h
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h
; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT: splice z3.h, p0, z3.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b
; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: srem_v16i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: sunpklo z2.h, z1.b
; VBITS_GE_512-NEXT: sunpklo z3.h, z0.b
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_512-NEXT: sunpklo z3.s, z3.h
; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT: uzp1 z2.b, z2.b, z2.b
; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = srem <16 x i8> %op1, %op2
ret <16 x i8> %res
}
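; Note the EXT sequences above and in the wider tests below: a
; vector_splice whose two inputs are identical is the case targeted by
; the constructive EXT_ZZI pseudo from #152554. With no tied operands,
; regalloc may either reuse the source register ("ext z2.b, z2.b, z2.b,
; #16" above) or pick a fresh one at the cost of a movprfx ("movprfx z5,
; z0" followed by "ext z5.b, z5.b, z0.b, #128" in srem_v256i8).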
define void @srem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ptrue p1.s, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.h, z1.b
; CHECK-NEXT: sunpklo z3.h, z0.b
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = srem <32 x i8> %op1, %op2
store <32 x i8> %res, ptr %a
ret void
}
define void @srem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl64
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.h, z1.b
; CHECK-NEXT: sunpklo z3.h, z0.b
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i8>, ptr %a
%op2 = load <64 x i8>, ptr %b
%res = srem <64 x i8> %op1, %op2
store <64 x i8> %res, ptr %a
ret void
}
define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.h, z1.b
; CHECK-NEXT: sunpklo z3.h, z0.b
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: sunpklo z5.s, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h
; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, ptr %a
%op2 = load <128 x i8>, ptr %b
%res = srem <128 x i8> %op1, %op2
store <128 x i8> %res, ptr %a
ret void
}
define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.h, z1.b
; CHECK-NEXT: sunpklo z3.h, z0.b
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: sunpklo z5.s, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: movprfx z5, z0
; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128
; CHECK-NEXT: sunpklo z5.h, z5.b
; CHECK-NEXT: sunpklo z7.s, z5.h
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #128
; CHECK-NEXT: sunpklo z5.s, z5.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT: sunpklo z3.h, z3.b
; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT: sunpklo z6.s, z3.h
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z7.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h
; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h
; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h
; CHECK-NEXT: ptrue p1.b, vl128
; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b
; CHECK-NEXT: splice z2.b, p1, z2.b, z3.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
%op2 = load <256 x i8>, ptr %b
%res = srem <256 x i8> %op1, %op2
store <256 x i8> %res, ptr %a
ret void
}
; Vector vXi16 sdiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v4i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll v2.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sshll v3.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s
; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v4i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: sshll v2.4s, v1.4h, #0
; VBITS_GE_256-NEXT: sshll v3.4s, v0.4h, #0
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: mov w8, v2.s[1]
; VBITS_GE_256-NEXT: mov v3.16b, v2.16b
; VBITS_GE_256-NEXT: mov w9, v2.s[2]
; VBITS_GE_256-NEXT: mov v3.h[1], w8
; VBITS_GE_256-NEXT: mov w8, v2.s[3]
; VBITS_GE_256-NEXT: mov v3.h[2], w9
; VBITS_GE_256-NEXT: mov v3.h[3], w8
; VBITS_GE_256-NEXT: mls v0.4h, v3.4h, v1.4h
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: srem_v4i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: sshll v2.4s, v1.4h, #0
; VBITS_GE_512-NEXT: sshll v3.4s, v0.4h, #0
; VBITS_GE_512-NEXT: ptrue p0.s, vl4
; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT: mov w8, v2.s[1]
; VBITS_GE_512-NEXT: mov v3.16b, v2.16b
; VBITS_GE_512-NEXT: mov w9, v2.s[2]
; VBITS_GE_512-NEXT: mov v3.h[1], w8
; VBITS_GE_512-NEXT: mov w8, v2.s[3]
; VBITS_GE_512-NEXT: mov v3.h[2], w9
; VBITS_GE_512-NEXT: mov v3.h[3], w8
; VBITS_GE_512-NEXT: mls v0.4h, v3.4h, v1.4h
; VBITS_GE_512-NEXT: ret
%res = srem <4 x i16> %op1, %op2
ret <4 x i16> %res
}
define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v8i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sshll v4.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sshll v3.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h
; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: srem_v8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = srem <8 x i16> %op1, %op2
ret <8 x i16> %res
}
define void @srem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v16i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q4, q1, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldr q0, [x0, #16]
; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: sshll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT: sshll v16.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ldr q3, [x0]
; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT: sshll v7.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT: sshll v6.4s, v4.4h, #0
; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT: sshll v7.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
; VBITS_GE_128-NEXT: uzp1 v5.8h, v6.8h, v5.8h
; VBITS_GE_128-NEXT: mls v3.8h, v5.8h, v4.8h
; VBITS_GE_128-NEXT: uzp1 v2.8h, v7.8h, v2.8h
; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT: stp q3, q0, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v16i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: movprfx z4, z0
; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h
; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT: movprfx z3, z1
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16
; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: sdivr z3.s, p1/m, z3.s, z4.s
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h
; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: srem_v16i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl16
; VBITS_GE_512-NEXT: ptrue p1.s, vl16
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h
; VBITS_GE_512-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = srem <16 x i16> %op1, %op2
store <16 x i16> %res, ptr %a
ret void
}
define void @srem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: ptrue p1.s, vl32
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.s, z1.h
; CHECK-NEXT: sunpklo z3.s, z0.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i16>, ptr %a
%op2 = load <32 x i16>, ptr %b
%res = srem <32 x i16> %op1, %op2
store <32 x i16> %res, ptr %a
ret void
}
define void @srem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.s, z1.h
; CHECK-NEXT: sunpklo z3.s, z0.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, ptr %a
%op2 = load <64 x i16>, ptr %b
%res = srem <64 x i16> %op1, %op2
store <64 x i16> %res, ptr %a
ret void
}
define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.s, z1.h
; CHECK-NEXT: sunpklo z3.s, z0.h
; CHECK-NEXT: movprfx z4, z0
; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128
; CHECK-NEXT: sunpklo z4.s, z4.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z4.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
%op2 = load <128 x i16>, ptr %b
%res = srem <128 x i16> %op1, %op2
store <128 x i16> %res, ptr %a
ret void
}
; Vector v2i32 sdiv is not legal for NEON, so use SVE when available.
define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = srem <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Vector v4i32 sdiv is not legal for NEON, so use SVE when available.
define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = srem <4 x i32> %op1, %op2
ret <4 x i32> %res
}
define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: srem_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = srem <8 x i32> %op1, %op2
store <8 x i32> %res, ptr %a
ret void
}
define void @srem_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v16i32:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT: movprfx z4, z1
; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z0.s
; VBITS_GE_128-NEXT: movprfx z19, z2
; VBITS_GE_128-NEXT: sdiv z19.s, p0/m, z19.s, z3.s
; VBITS_GE_128-NEXT: movprfx z7, z5
; VBITS_GE_128-NEXT: sdiv z7.s, p0/m, z7.s, z6.s
; VBITS_GE_128-NEXT: movprfx z18, z16
; VBITS_GE_128-NEXT: sdiv z18.s, p0/m, z18.s, z17.s
; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s
; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s
; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s
; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s
; VBITS_GE_128-NEXT: stp q1, q2, [x0]
; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: movprfx z2, z0
; VBITS_GE_256-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT: movprfx z5, z3
; VBITS_GE_256-NEXT: sdiv z5.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: srem_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: movprfx z2, z0
; VBITS_GE_512-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, ptr %a
%op2 = load <16 x i32>, ptr %b
%res = srem <16 x i32> %op1, %op2
store <16 x i32> %res, ptr %a
ret void
}
define void @srem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i32>, ptr %a
%op2 = load <32 x i32>, ptr %b
%res = srem <32 x i32> %op1, %op2
store <32 x i32> %res, ptr %a
ret void
}
define void @srem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i32>, ptr %a
%op2 = load <64 x i32>, ptr %b
%res = srem <64 x i32> %op1, %op2
store <64 x i32> %res, ptr %a
ret void
}
; Vector i64 sdiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128-bit case here.
define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = srem <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Vector i64 sdiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128-bit case here.
define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = srem <2 x i64> %op1, %op2
ret <2 x i64> %res
}
define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: srem_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = srem <4 x i64> %op1, %op2
store <4 x i64> %res, ptr %a
ret void
}
define void @srem_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v8i64:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.d, vl2
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT: movprfx z4, z1
; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z0.d
; VBITS_GE_128-NEXT: movprfx z19, z2
; VBITS_GE_128-NEXT: sdiv z19.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT: movprfx z7, z5
; VBITS_GE_128-NEXT: sdiv z7.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT: movprfx z18, z16
; VBITS_GE_128-NEXT: sdiv z18.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d
; VBITS_GE_128-NEXT: movprfx z1, z2
; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT: stp q0, q1, [x0]
; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: movprfx z2, z0
; VBITS_GE_256-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT: movprfx z5, z3
; VBITS_GE_256-NEXT: sdiv z5.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: srem_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: movprfx z2, z0
; VBITS_GE_512-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, ptr %a
%op2 = load <8 x i64>, ptr %b
%res = srem <8 x i64> %op1, %op2
store <8 x i64> %res, ptr %a
ret void
}
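; An explanatory note (not from the source): in the VBITS_GE_128 block
; above, "msb z0.d, p0/m, z4.d, z1.d" is the multiply-subtract form that
; overwrites a multiplicand (zdn = za - zdn * zm), whereas mls is the
; accumulator form (zda = zda - zn * zm); selecting between the two lets
; the result land in whichever register is free without an extra copy.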
define void @srem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, ptr %a
%op2 = load <16 x i64>, ptr %b
%res = srem <16 x i64> %op1, %op2
store <16 x i64> %res, ptr %a
ret void
}
define void @srem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, ptr %a
%op2 = load <32 x i64>, ptr %b
%res = srem <32 x i64> %op1, %op2
store <32 x i64> %res, ptr %a
ret void
}
;
; UREM
;
; Vector vXi8 udiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
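; The unsigned lowering mirrors srem above: urem(a, b) = a - (a / b) * b,
; using zero-extension (ushll/uunpklo) and the SVE udiv/udivr, followed
; by a multiply-subtract. For example, urem(7, 3) = 7 - (7 / 3) * 3 = 1.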
define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v8i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll v2.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ushll v3.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h
; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v8i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT: umov w8, v2.h[0]
; VBITS_GE_256-NEXT: umov w9, v2.h[1]
; VBITS_GE_256-NEXT: fmov s3, w8
; VBITS_GE_256-NEXT: umov w8, v2.h[2]
; VBITS_GE_256-NEXT: mov v3.b[1], w9
; VBITS_GE_256-NEXT: mov v3.b[2], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[3]
; VBITS_GE_256-NEXT: mov v3.b[3], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[4]
; VBITS_GE_256-NEXT: mov v3.b[4], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[5]
; VBITS_GE_256-NEXT: mov v3.b[5], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[6]
; VBITS_GE_256-NEXT: mov v3.b[6], w8
; VBITS_GE_256-NEXT: umov w8, v2.h[7]
; VBITS_GE_256-NEXT: mov v3.b[7], w8
; VBITS_GE_256-NEXT: mls v0.8b, v3.8b, v1.8b
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: urem_v8i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT: uunpklo z2.h, z1.b
; VBITS_GE_512-NEXT: uunpklo z3.h, z0.b
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
; VBITS_GE_512-NEXT: uunpklo z3.s, z3.h
; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT: umov w8, v2.h[0]
; VBITS_GE_512-NEXT: umov w9, v2.h[1]
; VBITS_GE_512-NEXT: fmov s3, w8
; VBITS_GE_512-NEXT: umov w8, v2.h[2]
; VBITS_GE_512-NEXT: mov v3.b[1], w9
; VBITS_GE_512-NEXT: mov v3.b[2], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[3]
; VBITS_GE_512-NEXT: mov v3.b[3], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[4]
; VBITS_GE_512-NEXT: mov v3.b[4], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[5]
; VBITS_GE_512-NEXT: mov v3.b[5], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[6]
; VBITS_GE_512-NEXT: mov v3.b[6], w8
; VBITS_GE_512-NEXT: umov w8, v2.h[7]
; VBITS_GE_512-NEXT: mov v3.b[7], w8
; VBITS_GE_512-NEXT: mls v0.8b, v3.8b, v1.8b
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = urem <8 x i8> %op1, %op2
ret <8 x i8> %res
}
define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v16i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: ushll v5.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ushll2 v7.4s, v5.8h, #0
; VBITS_GE_128-NEXT: ushll v5.4s, v5.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ushll v3.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT: uzp1 v3.8h, v3.8h, v6.8h
; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b
; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v16i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z4.s, z2.h
; VBITS_GE_256-NEXT: uunpklo z5.s, z3.h
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: uzp1 z3.h, z4.h, z4.h
; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT: splice z3.h, p0, z3.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b
; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: urem_v16i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: uunpklo z2.h, z1.b
; VBITS_GE_512-NEXT: uunpklo z3.h, z0.b
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
; VBITS_GE_512-NEXT: uunpklo z3.s, z3.h
; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT: uzp1 z2.b, z2.b, z2.b
; VBITS_GE_512-NEXT: mls v0.16b, v2.16b, v1.16b
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = urem <16 x i8> %op1, %op2
ret <16 x i8> %res
}
define void @urem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ptrue p1.s, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.h, z1.b
; CHECK-NEXT: uunpklo z3.h, z0.b
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = urem <32 x i8> %op1, %op2
store <32 x i8> %res, ptr %a
ret void
}
define void @urem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl64
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.h, z1.b
; CHECK-NEXT: uunpklo z3.h, z0.b
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i8>, ptr %a
%op2 = load <64 x i8>, ptr %b
%res = urem <64 x i8> %op1, %op2
store <64 x i8> %res, ptr %a
ret void
}
define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.h, z1.b
; CHECK-NEXT: uunpklo z3.h, z0.b
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpklo z5.s, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h
; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, ptr %a
%op2 = load <128 x i8>, ptr %b
%res = urem <128 x i8> %op1, %op2
store <128 x i8> %res, ptr %a
ret void
}
define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.h, z1.b
; CHECK-NEXT: uunpklo z3.h, z0.b
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpklo z5.s, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: movprfx z5, z0
; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128
; CHECK-NEXT: uunpklo z5.h, z5.b
; CHECK-NEXT: uunpklo z7.s, z5.h
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #128
; CHECK-NEXT: uunpklo z5.s, z5.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT: uunpklo z3.h, z3.b
; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT: uunpklo z6.s, z3.h
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z7.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h
; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h
; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h
; CHECK-NEXT: ptrue p1.b, vl128
; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b
; CHECK-NEXT: splice z2.b, p1, z2.b, z3.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
%op2 = load <256 x i8>, ptr %b
%res = urem <256 x i8> %op1, %op2
store <256 x i8> %res, ptr %a
ret void
}
; Vector vXi16 udiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v4i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll v2.4s, v1.4h, #0
; VBITS_GE_128-NEXT: ushll v3.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s
; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v4i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ushll v2.4s, v1.4h, #0
; VBITS_GE_256-NEXT: ushll v3.4s, v0.4h, #0
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: mov w8, v2.s[1]
; VBITS_GE_256-NEXT: mov v3.16b, v2.16b
; VBITS_GE_256-NEXT: mov w9, v2.s[2]
; VBITS_GE_256-NEXT: mov v3.h[1], w8
; VBITS_GE_256-NEXT: mov w8, v2.s[3]
; VBITS_GE_256-NEXT: mov v3.h[2], w9
; VBITS_GE_256-NEXT: mov v3.h[3], w8
; VBITS_GE_256-NEXT: mls v0.4h, v3.4h, v1.4h
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: urem_v4i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ushll v2.4s, v1.4h, #0
; VBITS_GE_512-NEXT: ushll v3.4s, v0.4h, #0
; VBITS_GE_512-NEXT: ptrue p0.s, vl4
; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT: mov w8, v2.s[1]
; VBITS_GE_512-NEXT: mov v3.16b, v2.16b
; VBITS_GE_512-NEXT: mov w9, v2.s[2]
; VBITS_GE_512-NEXT: mov v3.h[1], w8
; VBITS_GE_512-NEXT: mov w8, v2.s[3]
; VBITS_GE_512-NEXT: mov v3.h[2], w9
; VBITS_GE_512-NEXT: mov v3.h[3], w8
; VBITS_GE_512-NEXT: mls v0.4h, v3.4h, v1.4h
; VBITS_GE_512-NEXT: ret
%res = urem <4 x i16> %op1, %op2
ret <4 x i16> %res
}
define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v8i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ushll v4.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ushll v3.4s, v1.4h, #0
; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h
; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: urem_v8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = urem <8 x i16> %op1, %op2
ret <8 x i16> %res
}
define void @urem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v16i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q4, q1, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldr q0, [x0, #16]
; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ushll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT: ushll v16.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ldr q3, [x0]
; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT: ushll v7.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT: ushll v6.4s, v4.4h, #0
; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT: ushll v7.4s, v1.4h, #0
; VBITS_GE_128-NEXT: udivr z7.s, p0/m, z7.s, z16.s
; VBITS_GE_128-NEXT: uzp1 v5.8h, v6.8h, v5.8h
; VBITS_GE_128-NEXT: mls v3.8h, v5.8h, v4.8h
; VBITS_GE_128-NEXT: uzp1 v2.8h, v7.8h, v2.8h
; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT: stp q3, q0, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v16i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: movprfx z4, z0
; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z4.s, z4.h
; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT: movprfx z3, z1
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16
; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: udivr z3.s, p1/m, z3.s, z4.s
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h
; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: urem_v16i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl16
; VBITS_GE_512-NEXT: ptrue p1.s, vl16
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h
; VBITS_GE_512-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = urem <16 x i16> %op1, %op2
store <16 x i16> %res, ptr %a
ret void
}
define void @urem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: ptrue p1.s, vl32
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpklo z3.s, z0.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i16>, ptr %a
%op2 = load <32 x i16>, ptr %b
%res = urem <32 x i16> %op1, %op2
store <32 x i16> %res, ptr %a
ret void
}
define void @urem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpklo z3.s, z0.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, ptr %a
%op2 = load <64 x i16>, ptr %b
%res = urem <64 x i16> %op1, %op2
store <64 x i16> %res, ptr %a
ret void
}
define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpklo z3.s, z0.h
; CHECK-NEXT: movprfx z4, z0
; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128
; CHECK-NEXT: uunpklo z4.s, z4.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z4.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
%op2 = load <128 x i16>, ptr %b
%res = urem <128 x i16> %op1, %op2
store <128 x i16> %res, ptr %a
ret void
}
; Vector v2i32 udiv is not legal for NEON, so use SVE when available.
define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls v0.2s, v2.2s, v1.2s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = urem <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Vector v4i32 udiv is not legal for NEON, so use SVE when available.
define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = urem <4 x i32> %op1, %op2
ret <4 x i32> %res
}
define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: urem_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = urem <8 x i32> %op1, %op2
store <8 x i32> %res, ptr %a
ret void
}
define void @urem_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v16i32:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT: movprfx z4, z1
; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z0.s
; VBITS_GE_128-NEXT: movprfx z19, z2
; VBITS_GE_128-NEXT: udiv z19.s, p0/m, z19.s, z3.s
; VBITS_GE_128-NEXT: movprfx z7, z5
; VBITS_GE_128-NEXT: udiv z7.s, p0/m, z7.s, z6.s
; VBITS_GE_128-NEXT: movprfx z18, z16
; VBITS_GE_128-NEXT: udiv z18.s, p0/m, z18.s, z17.s
; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s
; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s
; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s
; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s
; VBITS_GE_128-NEXT: stp q1, q2, [x0]
; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: movprfx z2, z0
; VBITS_GE_256-NEXT: udiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT: movprfx z5, z3
; VBITS_GE_256-NEXT: udiv z5.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: urem_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: movprfx z2, z0
; VBITS_GE_512-NEXT: udiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, ptr %a
%op2 = load <16 x i32>, ptr %b
%res = urem <16 x i32> %op1, %op2
store <16 x i32> %res, ptr %a
ret void
}
define void @urem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i32>, ptr %a
%op2 = load <32 x i32>, ptr %b
%res = urem <32 x i32> %op1, %op2
store <32 x i32> %res, ptr %a
ret void
}
define void @urem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i32>, ptr %a
%op2 = load <64 x i32>, ptr %b
%res = urem <64 x i32> %op1, %op2
store <64 x i32> %res, ptr %a
ret void
}
; Vector i64 udiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128-bit case here.
define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = urem <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Vector i64 udiv is not legal for NEON, so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128-bit case here.
define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = urem <2 x i64> %op1, %op2
ret <2 x i64> %res
}
define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: urem_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = urem <4 x i64> %op1, %op2
store <4 x i64> %res, ptr %a
ret void
}
define void @urem_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v8i64:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.d, vl2
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT: movprfx z4, z1
; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z0.d
; VBITS_GE_128-NEXT: movprfx z19, z2
; VBITS_GE_128-NEXT: udiv z19.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT: movprfx z7, z5
; VBITS_GE_128-NEXT: udiv z7.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT: movprfx z18, z16
; VBITS_GE_128-NEXT: udiv z18.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d
; VBITS_GE_128-NEXT: movprfx z1, z2
; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT: stp q0, q1, [x0]
; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: movprfx z2, z0
; VBITS_GE_256-NEXT: udiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT: movprfx z5, z3
; VBITS_GE_256-NEXT: udiv z5.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT: movprfx z1, z3
; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: urem_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: movprfx z2, z0
; VBITS_GE_512-NEXT: udiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, ptr %a
%op2 = load <8 x i64>, ptr %b
%res = urem <8 x i64> %op1, %op2
store <8 x i64> %res, ptr %a
ret void
}
define void @urem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, ptr %a
%op2 = load <16 x i64>, ptr %b
%res = urem <16 x i64> %op1, %op2
store <16 x i64> %res, ptr %a
ret void
}
define void @urem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, ptr %a
%op2 = load <32 x i64>, ptr %b
%res = urem <32 x i64> %op1, %op2
store <32 x i64> %res, ptr %a
ret void
}
attributes #0 = { "target-features"="+sve" }