llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
Sander de Smalen 61510b51c3 Revert "[AArch64] Enable subreg liveness tracking by default."
This reverts commit 9c319d5bb40785c969d2af76535ca62448dfafa7.

Some issues were discovered with the bootstrap builds, which
seem like they were caused by this commit. I'm reverting to investigate.
2024-12-12 17:22:15 +00:00

1472 lines
54 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
;
; SDIV
;
; Vector vXi8 sdiv is not legal for NEON, so SVE is used when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
;
; <8 x i8> sdiv on register operands. Every prefix expects the i8 lanes to be
; sign-extended to 32 bits and divided on SVE .s lanes. The 256- and 512-bit
; prefixes expect identical code that narrows the result one lane at a time
; with umov/mov (presumably the codegen the FIXME above refers to).
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v8i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z1.h, z0.h, z0.h
; VBITS_GE_256-NEXT: umov w8, v1.h[0]
; VBITS_GE_256-NEXT: umov w9, v1.h[1]
; VBITS_GE_256-NEXT: fmov s0, w8
; VBITS_GE_256-NEXT: umov w8, v1.h[2]
; VBITS_GE_256-NEXT: mov v0.b[1], w9
; VBITS_GE_256-NEXT: mov v0.b[2], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[3]
; VBITS_GE_256-NEXT: mov v0.b[3], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[4]
; VBITS_GE_256-NEXT: mov v0.b[4], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[5]
; VBITS_GE_256-NEXT: mov v0.b[5], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[6]
; VBITS_GE_256-NEXT: mov v0.b[6], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[7]
; VBITS_GE_256-NEXT: mov v0.b[7], w8
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v8i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z1.h, z0.h, z0.h
; VBITS_GE_512-NEXT: umov w8, v1.h[0]
; VBITS_GE_512-NEXT: umov w9, v1.h[1]
; VBITS_GE_512-NEXT: fmov s0, w8
; VBITS_GE_512-NEXT: umov w8, v1.h[2]
; VBITS_GE_512-NEXT: mov v0.b[1], w9
; VBITS_GE_512-NEXT: mov v0.b[2], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[3]
; VBITS_GE_512-NEXT: mov v0.b[3], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[4]
; VBITS_GE_512-NEXT: mov v0.b[4], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[5]
; VBITS_GE_512-NEXT: mov v0.b[5], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[6]
; VBITS_GE_512-NEXT: mov v0.b[6], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[7]
; VBITS_GE_512-NEXT: mov v0.b[7], w8
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT: ret
%res = sdiv <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; <16 x i8> sdiv on register operands. The i8 lanes are sign-extended to 32
; bits before dividing: four .s divides are expected for the 128-bit prefix,
; two for the 256-bit prefix, and a single one for the 512-bit prefix.
define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: sshll2 v5.4s, v0.8h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sshll2 v3.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v1.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h
; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v16i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v16i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = sdiv <16 x i8> %op1, %op2
ret <16 x i8> %res
}
; <32 x i8> sdiv through memory (%a = %a / %b). vscale_range(8,0) sets a
; minimum vscale of 8, and only the common check prefix is used: both inputs
; are sign-extended to 32-bit lanes, divided once under vl32, and written back
; as bytes with st1b from the .s lanes.
define void @sdiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1b { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = sdiv <32 x i8> %op1, %op2
store <32 x i8> %res, ptr %a
ret void
}
; <64 x i8> sdiv through memory (%a = %a / %b). vscale_range(16,0) sets a
; minimum vscale of 16, and only the common check prefix is used: both inputs
; are sign-extended to 32-bit lanes, divided once under vl64, and written back
; as bytes with st1b from the .s lanes.
define void @sdiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1b { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i8>, ptr %a
%op2 = load <64 x i8>, ptr %b
%res = sdiv <64 x i8> %op1, %op2
store <64 x i8> %res, ptr %a
ret void
}
; <128 x i8> sdiv through memory (%a = %a / %b) with a minimum vscale of 16.
; The expected code splits the work into two 64-lane .s divides (the low half,
; then the half brought down by ext #128) and splices the narrowed halves back
; together before the store.
define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z2.s, z1.h
; CHECK-NEXT: sunpklo z3.s, z0.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: st1b { z1.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, ptr %a
%op2 = load <128 x i8>, ptr %b
%res = sdiv <128 x i8> %op1, %op2
store <128 x i8> %res, ptr %a
ret void
}
; <256 x i8> sdiv through memory (%a = %a / %b) with a minimum vscale of 16.
; The expected code performs four 64-lane .s divides on sign-extended quarters
; of the inputs, narrows each result with uzp1, and rebuilds the byte vector
; with three splices before a single st1b.
define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.h, z1.b
; CHECK-NEXT: sunpklo z3.h, z0.b
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: sunpklo z5.s, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: sunpklo z5.s, z0.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: sunpklo z3.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
; CHECK-NEXT: ptrue p1.b, vl128
; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b
; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
%op2 = load <256 x i8>, ptr %b
%res = sdiv <256 x i8> %op1, %op2
store <256 x i8> %res, ptr %a
ret void
}
; Vector vXi16 sdiv is not legal for NEON, so SVE is used when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
;
; <4 x i16> sdiv on register operands. Every prefix widens with NEON sshll and
; performs one .s divide under ptrue vl4. The 256- and 512-bit prefixes then
; narrow lane by lane with mov, instead of the single xtn expected at 128 bits.
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v4i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v4i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_256-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: sdivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_256-NEXT: mov w8, v1.s[1]
; VBITS_GE_256-NEXT: mov v0.16b, v1.16b
; VBITS_GE_256-NEXT: mov w9, v1.s[2]
; VBITS_GE_256-NEXT: mov v0.h[1], w8
; VBITS_GE_256-NEXT: mov w8, v1.s[3]
; VBITS_GE_256-NEXT: mov v0.h[2], w9
; VBITS_GE_256-NEXT: mov v0.h[3], w8
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v4i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_512-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_512-NEXT: ptrue p0.s, vl4
; VBITS_GE_512-NEXT: sdivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_512-NEXT: mov w8, v1.s[1]
; VBITS_GE_512-NEXT: mov v0.16b, v1.16b
; VBITS_GE_512-NEXT: mov w9, v1.s[2]
; VBITS_GE_512-NEXT: mov v0.h[1], w8
; VBITS_GE_512-NEXT: mov w8, v1.s[3]
; VBITS_GE_512-NEXT: mov v0.h[2], w9
; VBITS_GE_512-NEXT: mov v0.h[3], w8
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT: ret
%res = sdiv <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; <8 x i16> sdiv on register operands. The 128-bit prefix expects two vl4 .s
; divides on NEON-widened halves joined by uzp1. The 256- and 512-bit prefixes
; expect identical code: sunpklo widening, one vl8 divide, and a uzp1 narrow.
define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = sdiv <8 x i16> %op1, %op2
ret <8 x i16> %res
}
; <16 x i16> sdiv through memory (%a = %a / %b). There is no vscale_range
; attribute, so each prefix has its own expectations: four vl4 .s divides at
; 128 bits, two vl8 divides joined by splice at 256 bits, and a single vl16
; divide stored with st1h from .s lanes at 512 bits.
define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q4, q1, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldr q0, [x0, #16]
; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: sshll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT: sshll v4.4s, v4.4h, #0
; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ldr q3, [x0]
; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v1.8h, v3.8h, v5.8h
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: stp q1, q0, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v16i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v16i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl16
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = sdiv <16 x i16> %op1, %op2
store <16 x i16> %res, ptr %a
ret void
}
; <32 x i16> sdiv through memory (%a = %a / %b). vscale_range(8,0) sets a
; minimum vscale of 8, and only the common check prefix is used: the inputs are
; sign-extended to 32-bit lanes, divided once under vl32, and stored with st1h
; from the .s lanes.
define void @sdiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl32
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i16>, ptr %a
%op2 = load <32 x i16>, ptr %b
%res = sdiv <32 x i16> %op1, %op2
store <32 x i16> %res, ptr %a
ret void
}
; <64 x i16> sdiv through memory (%a = %a / %b). vscale_range(16,0) sets a
; minimum vscale of 16, and only the common check prefix is used: the inputs
; are sign-extended to 32-bit lanes, divided once under vl64, and stored with
; st1h from the .s lanes.
define void @sdiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, ptr %a
%op2 = load <64 x i16>, ptr %b
%res = sdiv <64 x i16> %op1, %op2
store <64 x i16> %res, ptr %a
ret void
}
; <128 x i16> sdiv through memory (%a = %a / %b) with a minimum vscale of 16.
; The expected code performs two 64-lane .s divides (the low half, then the
; half brought down by ext #128) and splices the narrowed results together
; before a single st1h.
define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.s, z1.h
; CHECK-NEXT: sunpklo z3.s, z0.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT: st1h { z1.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
%op2 = load <128 x i16>, ptr %b
%res = sdiv <128 x i16> %op1, %op2
store <128 x i16> %res, ptr %a
ret void
}
; Vector v2i32 sdiv is not legal for NEON, so SVE is used when available.
; The lanes are already 32 bits wide, so the expected code is a single
; predicated sdiv under ptrue vl2 with no widening or narrowing.
define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = sdiv <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Vector v4i32 sdiv is not legal for NEON, so SVE is used when available.
; The lanes are already 32 bits wide, so the expected code is a single
; predicated sdiv under ptrue vl4 with no widening or narrowing.
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = sdiv <4 x i32> %op1, %op2
ret <4 x i32> %res
}
; <8 x i32> sdiv through memory (%a = %a / %b). vscale_range(2,0) sets a
; minimum vscale of 2, and only the common check prefix is used: one ld1w per
; input, a single sdiv under vl8, and one st1w.
define void @sdiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = sdiv <8 x i32> %op1, %op2
store <8 x i32> %res, ptr %a
ret void
}
; <16 x i32> sdiv through memory (%a = %a / %b). There is no vscale_range
; attribute, so each prefix has its own expectations: four vl4 divides on
; Q-register loads at 128 bits, two vl8 divides (upper half addressed through
; x8 = 8 elements) at 256 bits, and a single vl16 divide at 512 bits.
define void @sdiv_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i32:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: ldr q1, [x0, #48]
; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z4.s
; VBITS_GE_128-NEXT: ldr q4, [x0, #32]
; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT: stp q0, q2, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: sdiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, ptr %a
%op2 = load <16 x i32>, ptr %b
%res = sdiv <16 x i32> %op1, %op2
store <16 x i32> %res, ptr %a
ret void
}
; <32 x i32> sdiv through memory (%a = %a / %b). vscale_range(8,0) sets a
; minimum vscale of 8, and only the common check prefix is used: one ld1w per
; input, a single sdiv under vl32, and one st1w.
define void @sdiv_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i32>, ptr %a
%op2 = load <32 x i32>, ptr %b
%res = sdiv <32 x i32> %op1, %op2
store <32 x i32> %res, ptr %a
ret void
}
; <64 x i32> sdiv through memory (%a = %a / %b). vscale_range(16,0) sets a
; minimum vscale of 16, and only the common check prefix is used: one ld1w per
; input, a single sdiv under vl64, and one st1w.
define void @sdiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i32>, ptr %a
%op2 = load <64 x i32>, ptr %b
%res = sdiv <64 x i32> %op1, %op2
store <64 x i32> %res, ptr %a
ret void
}
; Vector i64 sdiv is not legal for NEON, so SVE is used when available.
; <1 x i64> case: the expected code is a single predicated sdiv on .d lanes
; under ptrue vl1.
define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = sdiv <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Vector i64 sdiv is not legal for NEON, so SVE is used when available.
; <2 x i64> case: the expected code is a single predicated sdiv on .d lanes
; under ptrue vl2.
define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = sdiv <2 x i64> %op1, %op2
ret <2 x i64> %res
}
; <4 x i64> sdiv through memory (%a = %a / %b). vscale_range(2,0) sets a
; minimum vscale of 2, and only the common check prefix is used: one ld1d per
; input, a single .d sdiv under vl4, and one st1d.
define void @sdiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = sdiv <4 x i64> %op1, %op2
store <4 x i64> %res, ptr %a
ret void
}
; <8 x i64> sdiv through memory (%a = %a / %b). There is no vscale_range
; attribute, so each prefix has its own expectations: four vl2 .d divides on
; Q-register loads at 128 bits, two vl4 divides (upper half addressed through
; x8 = 4 elements) at 256 bits, and a single vl8 divide at 512 bits.
define void @sdiv_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i64:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.d, vl2
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT: sdivr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_128-NEXT: ldr q1, [x0, #48]
; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z4.d
; VBITS_GE_128-NEXT: ldr q4, [x0, #32]
; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z5.d
; VBITS_GE_128-NEXT: sdiv z2.d, p0/m, z2.d, z3.d
; VBITS_GE_128-NEXT: stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT: stp q0, q2, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: sdiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, ptr %a
%op2 = load <8 x i64>, ptr %b
%res = sdiv <8 x i64> %op1, %op2
store <8 x i64> %res, ptr %a
ret void
}
; <16 x i64> sdiv through memory (%a = %a / %b). vscale_range(8,0) sets a
; minimum vscale of 8, and only the common check prefix is used: one ld1d per
; input, a single .d sdiv under vl16, and one st1d.
define void @sdiv_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, ptr %a
%op2 = load <16 x i64>, ptr %b
%res = sdiv <16 x i64> %op1, %op2
store <16 x i64> %res, ptr %a
ret void
}
; <32 x i64> sdiv through memory (%a = %a / %b). vscale_range(16,0) sets a
; minimum vscale of 16, and only the common check prefix is used: one ld1d per
; input, a single .d sdiv under vl32, and one st1d.
define void @sdiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, ptr %a
%op2 = load <32 x i64>, ptr %b
%res = sdiv <32 x i64> %op1, %op2
store <32 x i64> %res, ptr %a
ret void
}
;
; UDIV
;
; Vector vXi8 udiv is not legal for NEON, so SVE is used when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
;
; <8 x i8> udiv on register operands. Every prefix expects the i8 lanes to be
; zero-extended to 32 bits and divided on SVE .s lanes. The 256- and 512-bit
; prefixes expect identical code that narrows the result one lane at a time
; with umov/mov (presumably the codegen the FIXME above refers to).
define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v8i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v8i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z1.h, z0.h, z0.h
; VBITS_GE_256-NEXT: umov w8, v1.h[0]
; VBITS_GE_256-NEXT: umov w9, v1.h[1]
; VBITS_GE_256-NEXT: fmov s0, w8
; VBITS_GE_256-NEXT: umov w8, v1.h[2]
; VBITS_GE_256-NEXT: mov v0.b[1], w9
; VBITS_GE_256-NEXT: mov v0.b[2], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[3]
; VBITS_GE_256-NEXT: mov v0.b[3], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[4]
; VBITS_GE_256-NEXT: mov v0.b[4], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[5]
; VBITS_GE_256-NEXT: mov v0.b[5], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[6]
; VBITS_GE_256-NEXT: mov v0.b[6], w8
; VBITS_GE_256-NEXT: umov w8, v1.h[7]
; VBITS_GE_256-NEXT: mov v0.b[7], w8
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v8i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z1.h, z0.h, z0.h
; VBITS_GE_512-NEXT: umov w8, v1.h[0]
; VBITS_GE_512-NEXT: umov w9, v1.h[1]
; VBITS_GE_512-NEXT: fmov s0, w8
; VBITS_GE_512-NEXT: umov w8, v1.h[2]
; VBITS_GE_512-NEXT: mov v0.b[1], w9
; VBITS_GE_512-NEXT: mov v0.b[2], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[3]
; VBITS_GE_512-NEXT: mov v0.b[3], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[4]
; VBITS_GE_512-NEXT: mov v0.b[4], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[5]
; VBITS_GE_512-NEXT: mov v0.b[5], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[6]
; VBITS_GE_512-NEXT: mov v0.b[6], w8
; VBITS_GE_512-NEXT: umov w8, v1.h[7]
; VBITS_GE_512-NEXT: mov v0.b[7], w8
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT: ret
%res = udiv <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; <16 x i8> udiv on register operands. The i8 lanes are zero-extended to 32
; bits before dividing: four .s divides are expected for the 128-bit prefix,
; two for the 256-bit prefix, and a single one for the 512-bit prefix.
define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v16i8:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: ushll2 v5.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ushll2 v3.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v1.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h
; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v16i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ptrue p0.h, vl8
; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT: uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v16i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = udiv <16 x i8> %op1, %op2
ret <16 x i8> %res
}
; udiv of <32 x i8>: both operands are loaded from %a and %b and the quotient
; is stored back to %a. vscale_range(8,0) declares vscale >= 8, and the
; expected code loads the bytes straight into 32-bit lanes (ld1b into .s),
; issues a single predicated udivr over 32 lanes, and writes the result back
; with st1b from the .s lanes.
define void @udiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0]
; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1b { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = udiv <32 x i8> %op1, %op2
store <32 x i8> %res, ptr %a
ret void
}
; udiv of <64 x i8> loaded from %a and %b, stored back to %a. With
; vscale_range(16,0) (vscale >= 16) the expected code is the same shape as the
; 32-element case: ld1b into .s lanes, one udivr under a vl64 predicate, and
; st1b from the .s lanes.
define void @udiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0]
; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1b { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i8>, ptr %a
%op2 = load <64 x i8>, ptr %b
%res = udiv <64 x i8> %op1, %op2
store <64 x i8> %res, ptr %a
ret void
}
; udiv of <128 x i8> loaded from %a and %b, stored back to %a, with
; vscale_range(16,0). The expected code loads the bytes into 16-bit lanes,
; splits each operand into a low half (uunpklo) and a high half (ext #128 then
; uunpklo), divides each half in 32-bit lanes under a vl64 predicate, then
; narrows both quotients with uzp1 and rejoins them with splice before the
; st1b.
define void @udiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
; CHECK-NEXT: uunpklo z2.s, z0.h
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: udivr z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT: st1b { z1.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, ptr %a
%op2 = load <128 x i8>, ptr %b
%res = udiv <128 x i8> %op1, %op2
store <128 x i8> %res, ptr %a
ret void
}
; udiv of <256 x i8> loaded from %a and %b, stored back to %a, with
; vscale_range(16,0). The expected code loads full byte vectors and widens in
; two stages (.b -> .h -> .s), using ext #128 to reach each upper half, which
; yields four 64-lane pieces per operand. Each piece is divided in 32-bit
; lanes (three udivr, one udiv), then the quotients are narrowed with uzp1 and
; recombined with splice in two stages (.s -> .h -> .b) before the st1b.
define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.h, z1.b
; CHECK-NEXT: uunpklo z3.h, z0.b
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: uunpklo z1.h, z1.b
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpklo z5.s, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: uunpklo z5.s, z0.h
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
; CHECK-NEXT: ptrue p1.b, vl128
; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b
; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
%op2 = load <256 x i8>, ptr %b
%res = udiv <256 x i8> %op1, %op2
store <256 x i8> %res, ptr %a
ret void
}
; Vector vXi16 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
; udiv of <4 x i16> passed and returned by value. The function has no
; vscale_range, so each RUN configuration has its own expectations. All three
; widen both operands to 32-bit lanes with ushll and divide under a vl4
; predicate. The 128-bit run narrows the quotient with a single xtn; the
; 256-bit and 512-bit runs instead rebuild the 16-bit result lane by lane with
; scalar mov instructions.
define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v4i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v4i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_256-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_256-NEXT: ptrue p0.s, vl4
; VBITS_GE_256-NEXT: udivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_256-NEXT: mov w8, v1.s[1]
; VBITS_GE_256-NEXT: mov v0.16b, v1.16b
; VBITS_GE_256-NEXT: mov w9, v1.s[2]
; VBITS_GE_256-NEXT: mov v0.h[1], w8
; VBITS_GE_256-NEXT: mov w8, v1.s[3]
; VBITS_GE_256-NEXT: mov v0.h[2], w9
; VBITS_GE_256-NEXT: mov v0.h[3], w8
; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v4i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_512-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_512-NEXT: ptrue p0.s, vl4
; VBITS_GE_512-NEXT: udivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_512-NEXT: mov w8, v1.s[1]
; VBITS_GE_512-NEXT: mov v0.16b, v1.16b
; VBITS_GE_512-NEXT: mov w9, v1.s[2]
; VBITS_GE_512-NEXT: mov v0.h[1], w8
; VBITS_GE_512-NEXT: mov w8, v1.s[3]
; VBITS_GE_512-NEXT: mov v0.h[2], w9
; VBITS_GE_512-NEXT: mov v0.h[3], w8
; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT: ret
%res = udiv <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; udiv of <8 x i16> passed and returned by value, with per-configuration
; expectations. The 128-bit run splits each operand into two 4-lane halves
; (ushll / ushll2), divides each half in 32-bit lanes, and recombines with a
; NEON uzp1. The 256-bit and 512-bit runs widen all eight elements at once
; with uunpklo, perform a single udiv under a vl8 predicate, and narrow with
; an SVE uzp1.
define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v8i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v8i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT: ret
%res = udiv <8 x i16> %op1, %op2
ret <8 x i16> %res
}
; udiv of <16 x i16> loaded from %a and %b, stored back to %a. There is no
; vscale_range, so the expected code differs per RUN configuration:
;  - 128-bit run: operands are loaded as q registers, widened with
;    ushll / ushll2 into four 4-lane pieces, divided with four 32-bit
;    divides, and recombined with NEON uzp1 before an stp.
;  - 256-bit run: ld1h into .h lanes, split into two 8-lane halves
;    (uunpklo, and ext #16 + uunpklo), two divides, then uzp1 + splice.
;  - 512-bit run: ld1h directly into .s lanes, a single udivr under a vl16
;    predicate, and st1h from the .s lanes.
define void @udiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v16i16:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q4, q1, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldr q0, [x0, #16]
; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT: ushll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT: ushll v4.4s, v4.4h, #0
; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: ldr q3, [x0]
; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: uzp1 v1.8h, v3.8h, v5.8h
; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: stp q1, q0, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v16i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT: udiv z0.s, p1/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v16i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: ld1h { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = udiv <16 x i16> %op1, %op2
store <16 x i16> %res, ptr %a
ret void
}
; udiv of <32 x i16> loaded from %a and %b, stored back to %a. With
; vscale_range(8,0) (vscale >= 8) the expected code loads the halfwords
; directly into 32-bit lanes (ld1h into .s), performs one udivr under a vl32
; predicate, and stores with st1h from the .s lanes.
define void @udiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0]
; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i16>, ptr %a
%op2 = load <32 x i16>, ptr %b
%res = udiv <32 x i16> %op1, %op2
store <32 x i16> %res, ptr %a
ret void
}
; udiv of <64 x i16> loaded from %a and %b, stored back to %a. With
; vscale_range(16,0) the expected code is a single-divide sequence: ld1h into
; .s lanes, one udivr under a vl64 predicate, and st1h from the .s lanes.
define void @udiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0]
; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, ptr %a
%op2 = load <64 x i16>, ptr %b
%res = udiv <64 x i16> %op1, %op2
store <64 x i16> %res, ptr %a
ret void
}
; udiv of <128 x i16> loaded from %a and %b, stored back to %a, with
; vscale_range(16,0). The expected code loads into .h lanes, splits each
; operand into two 64-lane halves (uunpklo for the low half, ext #128 then
; uunpklo for the high half), divides each half in 32-bit lanes, then narrows
; with uzp1 and rejoins with splice before the st1h.
define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpklo z3.s, z0.h
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT: ptrue p1.h, vl64
; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT: st1h { z1.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
%op2 = load <128 x i16>, ptr %b
%res = udiv <128 x i16> %op1, %op2
store <128 x i16> %res, ptr %a
ret void
}
; Vector v2i32 udiv are not legal for NEON so use SVE when available.
; udiv of <2 x i32> passed and returned by value. The expected code needs no
; widening: the d-register arguments are treated as z registers and divided
; with one udiv on .s lanes under a vl2 predicate.
define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = udiv <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Vector v4i32 udiv are not legal for NEON so use SVE when available.
; udiv of <4 x i32> passed and returned by value. The expected code treats
; the q-register arguments as z registers and divides them with one udiv on
; .s lanes under a vl4 predicate.
define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = udiv <4 x i32> %op1, %op2
ret <4 x i32> %res
}
; udiv of <8 x i32> loaded from %a and %b, stored back to %a. With
; vscale_range(2,0) (vscale >= 2) the expected code is ld1w, one udiv on .s
; lanes under a vl8 predicate, and st1w.
define void @udiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = udiv <8 x i32> %op1, %op2
store <8 x i32> %res, ptr %a
ret void
}
; udiv of <16 x i32> loaded from %a and %b, stored back to %a. There is no
; vscale_range, so the expected code differs per RUN configuration:
;  - 128-bit run: four q-register pieces per operand, four divides under a
;    vl4 predicate, results written with two stp.
;  - 256-bit run: two vl8 pieces; the upper piece is addressed through
;    x8 = 8 scaled by lsl #2.
;  - 512-bit run: a single ld1w / udiv / st1w sequence under a vl16 predicate.
define void @udiv_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v16i32:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT: ldr q1, [x0, #48]
; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z4.s
; VBITS_GE_128-NEXT: ldr q4, [x0, #32]
; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: udiv z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT: stp q0, q2, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: udiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, ptr %a
%op2 = load <16 x i32>, ptr %b
%res = udiv <16 x i32> %op1, %op2
store <16 x i32> %res, ptr %a
ret void
}
; udiv of <32 x i32> loaded from %a and %b, stored back to %a. With
; vscale_range(8,0) the expected code is ld1w, one udiv on .s lanes under a
; vl32 predicate, and st1w.
define void @udiv_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i32>, ptr %a
%op2 = load <32 x i32>, ptr %b
%res = udiv <32 x i32> %op1, %op2
store <32 x i32> %res, ptr %a
ret void
}
; udiv of <64 x i32> loaded from %a and %b, stored back to %a. With
; vscale_range(16,0) the expected code is ld1w, one udiv on .s lanes under a
; vl64 predicate, and st1w.
define void @udiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i32>, ptr %a
%op2 = load <64 x i32>, ptr %b
%res = udiv <64 x i32> %op1, %op2
store <64 x i32> %res, ptr %a
ret void
}
; Vector v1i64 udiv are not legal for NEON so use SVE when available.
; udiv of <1 x i64> passed and returned by value. The expected code treats
; the d-register arguments as z registers and divides them with one udiv on
; .d lanes under a vl1 predicate.
define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = udiv <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Vector v2i64 udiv are not legal for NEON so use SVE when available.
; udiv of <2 x i64> passed and returned by value. The expected code treats
; the q-register arguments as z registers and divides them with one udiv on
; .d lanes under a vl2 predicate.
define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = udiv <2 x i64> %op1, %op2
ret <2 x i64> %res
}
; udiv of <4 x i64> loaded from %a and %b, stored back to %a. With
; vscale_range(2,0) the expected code is ld1d, one udiv on .d lanes under a
; vl4 predicate, and st1d.
define void @udiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = udiv <4 x i64> %op1, %op2
store <4 x i64> %res, ptr %a
ret void
}
; udiv of <8 x i64> loaded from %a and %b, stored back to %a. There is no
; vscale_range, so the expected code differs per RUN configuration:
;  - 128-bit run: four q-register pieces per operand, four divides on .d
;    lanes under a vl2 predicate, results written with two stp.
;  - 256-bit run: two vl4 pieces; the upper piece is addressed through
;    x8 = 4 scaled by lsl #3.
;  - 512-bit run: a single ld1d / udiv / st1d sequence under a vl8 predicate.
define void @udiv_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v8i64:
; VBITS_GE_128: // %bb.0:
; VBITS_GE_128-NEXT: ldp q0, q3, [x1]
; VBITS_GE_128-NEXT: ptrue p0.d, vl2
; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT: udivr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_128-NEXT: ldr q1, [x0, #48]
; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z4.d
; VBITS_GE_128-NEXT: ldr q4, [x0, #32]
; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z5.d
; VBITS_GE_128-NEXT: udiv z2.d, p0/m, z2.d, z3.d
; VBITS_GE_128-NEXT: stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT: stp q0, q2, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: udiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, ptr %a
%op2 = load <8 x i64>, ptr %b
%res = udiv <8 x i64> %op1, %op2
store <8 x i64> %res, ptr %a
ret void
}
; udiv of <16 x i64> loaded from %a and %b, stored back to %a. With
; vscale_range(8,0) the expected code is ld1d, one udiv on .d lanes under a
; vl16 predicate, and st1d.
define void @udiv_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, ptr %a
%op2 = load <16 x i64>, ptr %b
%res = udiv <16 x i64> %op1, %op2
store <16 x i64> %res, ptr %a
ret void
}
; udiv of <32 x i64> loaded from %a and %b, stored back to %a. With
; vscale_range(16,0) the expected code is ld1d, one udiv on .d lanes under a
; vl32 predicate, and st1d.
define void @udiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, ptr %a
%op2 = load <32 x i64>, ptr %b
%res = udiv <32 x i64> %op1, %op2
store <32 x i64> %res, ptr %a
ret void
}
; This used to crash because isUnaryPredicate and BuildUDIV don't know how
; a SPLAT_VECTOR of fixed vector type should be handled.
; Divides a <8 x i32> loaded from %a by a constant splat of 95 and stores the
; quotient back to %a. This is the only function here using attribute group
; #1 (+sve with minsize). The expected code materializes the divisor with
; mov z1.s, #95 and emits an actual udiv on .s lanes under a vl8 predicate.
define void @udiv_constantsplat_v8i32(ptr %a) vscale_range(2,0) #1 {
; CHECK-LABEL: udiv_constantsplat_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: mov z1.s, #95 // =0x5f
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
store <8 x i32> %res, ptr %a
ret void
}
; Attribute groups referenced by the functions above: #0 enables the SVE
; target feature; #1 enables SVE and additionally marks the function minsize.
attributes #0 = { "target-features"="+sve" }
attributes #1 = { "target-features"="+sve" minsize }