llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
[AArch64][ISel] Select constructive EXT_ZZI pseudo instruction (#152554)
The patch adds patterns to select the EXT_ZZI_CONSTRUCTIVE pseudo
instead of the destructive EXT_ZZI instruction for vector_splice. This
only works when the two inputs to vector_splice are identical.

Because the registers are no longer tied, the register allocator has
more freedom, and many MOVs get replaced with MOVPRFX.

In some cases, however, the same input and output register could simply
have been chosen, but regalloc preferred not to. As a result, some test
cases now have more instructions: a MOVPRFX appears where previously no
MOV was needed.
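
As an illustration (a sketch distilled from the SVE check lines below, not
lifted from the patch itself), splicing a vector with itself previously
needed a full copy before the destructive EXT:

  mov z1.d, z0.d
  ext z1.b, z1.b, z0.b, #16

whereas with the constructive pseudo the allocator can instead emit:

  movprfx z1, z0
  ext z1.b, z1.b, z0.b, #16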

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,NEON
; RUN: llc -mattr=+sve,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,SVE
; RUN: llc -mattr=+sme,+i8mm -force-streaming < %s | FileCheck %s --check-prefix=SME
target triple = "aarch64"
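; The COMMON prefix is shared by the NEON and SVE runs; the SME run uses
; -force-streaming, where only streaming-compatible (SVE/SME) instructions
; may be used.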
;
; Two-way mla (i8 -> i16)
;
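; A two-way partial reduction folds a <2N x iX> product vector into an
; <N x i2X> accumulator. The intrinsic only fixes the total sum, not which
; product elements feed which accumulator lane, so conceptually (a sketch,
; not a documented lane mapping):
;   acc[i] += mult[j] + mult[k]    (two product elements per lane)
; NEON picks the low/high halves (umlal/umlal2), while the streaming-SVE
; run picks the even/odd interleaved lanes (umlalb/umlalt).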
define <8 x i16> @two_way_i8_i16_vl128(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: two_way_i8_i16_vl128:
; COMMON: // %bb.0:
; COMMON-NEXT: ldr q0, [x0]
; COMMON-NEXT: ldr q1, [x1]
; COMMON-NEXT: ldr q2, [x2]
; COMMON-NEXT: umlal v0.8h, v2.8b, v1.8b
; COMMON-NEXT: umlal2 v0.8h, v2.16b, v1.16b
; COMMON-NEXT: ret
;
; SME-LABEL: two_way_i8_i16_vl128:
; SME: // %bb.0:
; SME-NEXT: ldr q0, [x0]
; SME-NEXT: ldr q1, [x1]
; SME-NEXT: ldr q2, [x2]
; SME-NEXT: umlalb z0.h, z2.b, z1.b
; SME-NEXT: umlalt z0.h, z2.b, z1.b
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <8 x i16>, ptr %accptr
%u = load <16 x i8>, ptr %uptr
%s = load <16 x i8>, ptr %sptr
%u.wide = zext <16 x i8> %u to <16 x i16>
%s.wide = zext <16 x i8> %s to <16 x i16>
%mult = mul nuw nsw <16 x i16> %s.wide, %u.wide
%partial.reduce = tail call <8 x i16> @llvm.experimental.vector.partial.reduce.add(<8 x i16> %acc, <16 x i16> %mult)
ret <8 x i16> %partial.reduce
}
define <16 x i16> @two_way_i8_i16_vl128_double_width(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: two_way_i8_i16_vl128_double_width:
; COMMON: // %bb.0:
; COMMON-NEXT: ldp q0, q1, [x0]
; COMMON-NEXT: ldp q2, q3, [x1]
; COMMON-NEXT: ldp q4, q5, [x2]
; COMMON-NEXT: umlal v0.8h, v4.8b, v2.8b
; COMMON-NEXT: umlal v1.8h, v5.8b, v3.8b
; COMMON-NEXT: umlal2 v0.8h, v4.16b, v2.16b
; COMMON-NEXT: umlal2 v1.8h, v5.16b, v3.16b
; COMMON-NEXT: ret
;
; SME-LABEL: two_way_i8_i16_vl128_double_width:
; SME: // %bb.0:
; SME-NEXT: ldp q0, q1, [x0]
; SME-NEXT: ldp q3, q2, [x1]
; SME-NEXT: ldp q5, q4, [x2]
; SME-NEXT: umlalb z0.h, z5.b, z3.b
; SME-NEXT: umlalb z1.h, z4.b, z2.b
; SME-NEXT: umlalt z0.h, z5.b, z3.b
; SME-NEXT: umlalt z1.h, z4.b, z2.b
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <16 x i16>, ptr %accptr
%u = load <32 x i8>, ptr %uptr
%s = load <32 x i8>, ptr %sptr
%u.wide = zext <32 x i8> %u to <32 x i16>
%s.wide = zext <32 x i8> %s to <32 x i16>
%mult = mul nuw nsw <32 x i16> %s.wide, %u.wide
%partial.reduce = tail call <16 x i16> @llvm.experimental.vector.partial.reduce.add(<16 x i16> %acc, <32 x i16> %mult)
ret <16 x i16> %partial.reduce
}
define <16 x i16> @two_way_i8_i16_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
;
;
; NEON-LABEL: two_way_i8_i16_vl256:
; NEON: // %bb.0:
; NEON-NEXT: ldp q0, q1, [x0]
; NEON-NEXT: ldp q2, q3, [x1]
; NEON-NEXT: ldp q4, q5, [x2]
; NEON-NEXT: umlal v0.8h, v4.8b, v2.8b
; NEON-NEXT: umlal v1.8h, v5.8b, v3.8b
; NEON-NEXT: umlal2 v0.8h, v4.16b, v2.16b
; NEON-NEXT: umlal2 v1.8h, v5.16b, v3.16b
; NEON-NEXT: ret
;
; SVE-LABEL: two_way_i8_i16_vl256:
; SVE: // %bb.0:
; SVE-NEXT: ldr z0, [x1]
; SVE-NEXT: ldr z1, [x2]
; SVE-NEXT: ptrue p0.h
; SVE-NEXT: ldr z4, [x0]
; SVE-NEXT: uunpklo z2.h, z0.b
; SVE-NEXT: uunpklo z3.h, z1.b
; SVE-NEXT: uunpkhi z0.h, z0.b
; SVE-NEXT: uunpkhi z1.h, z1.b
; SVE-NEXT: mad z2.h, p0/m, z3.h, z4.h
; SVE-NEXT: mad z0.h, p0/m, z1.h, z2.h
; SVE-NEXT: movprfx z1, z0
; SVE-NEXT: ext z1.b, z1.b, z0.b, #16
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1
; SVE-NEXT: ret
;
; SME-LABEL: two_way_i8_i16_vl256:
; SME: // %bb.0:
; SME-NEXT: ldr z0, [x0]
; SME-NEXT: ldr z1, [x1]
; SME-NEXT: ldr z2, [x2]
; SME-NEXT: umlalb z0.h, z2.b, z1.b
; SME-NEXT: umlalt z0.h, z2.b, z1.b
; SME-NEXT: movprfx z1, z0
; SME-NEXT: ext z1.b, z1.b, z0.b, #16
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <16 x i16>, ptr %accptr
%u = load <32 x i8>, ptr %uptr
%s = load <32 x i8>, ptr %sptr
%u.wide = zext <32 x i8> %u to <32 x i16>
%s.wide = zext <32 x i8> %s to <32 x i16>
%mult = mul nuw nsw <32 x i16> %s.wide, %u.wide
%partial.reduce = tail call <16 x i16> @llvm.experimental.vector.partial.reduce.add(<16 x i16> %acc, <32 x i16> %mult)
ret <16 x i16> %partial.reduce
}
;
; Two-way mla (i16 -> i32)
;
define <4 x i32> @two_way_i16_i32_vl128(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: two_way_i16_i32_vl128:
; COMMON: // %bb.0:
; COMMON-NEXT: ldr q0, [x0]
; COMMON-NEXT: ldr q1, [x1]
; COMMON-NEXT: ldr q2, [x2]
; COMMON-NEXT: umlal v0.4s, v2.4h, v1.4h
; COMMON-NEXT: umlal2 v0.4s, v2.8h, v1.8h
; COMMON-NEXT: ret
;
; SME-LABEL: two_way_i16_i32_vl128:
; SME: // %bb.0:
; SME-NEXT: ldr q0, [x0]
; SME-NEXT: ldr q1, [x1]
; SME-NEXT: ldr q2, [x2]
; SME-NEXT: umlalb z0.s, z2.h, z1.h
; SME-NEXT: umlalt z0.s, z2.h, z1.h
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <4 x i32>, ptr %accptr
%u = load <8 x i16>, ptr %uptr
%s = load <8 x i16>, ptr %sptr
%u.wide = zext <8 x i16> %u to <8 x i32>
%s.wide = zext <8 x i16> %s to <8 x i32>
%mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <8 x i32> %mult)
ret <4 x i32> %partial.reduce
}
define <8 x i32> @two_way_i16_i32_vl128_double_width(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: two_way_i16_i32_vl128_double_width:
; COMMON: // %bb.0:
; COMMON-NEXT: ldp q0, q1, [x0]
; COMMON-NEXT: ldp q2, q3, [x1]
; COMMON-NEXT: ldp q4, q5, [x2]
; COMMON-NEXT: umlal v0.4s, v4.4h, v2.4h
; COMMON-NEXT: umlal v1.4s, v5.4h, v3.4h
; COMMON-NEXT: umlal2 v0.4s, v4.8h, v2.8h
; COMMON-NEXT: umlal2 v1.4s, v5.8h, v3.8h
; COMMON-NEXT: ret
;
; SME-LABEL: two_way_i16_i32_vl128_double_width:
; SME: // %bb.0:
; SME-NEXT: ldp q0, q1, [x0]
; SME-NEXT: ldp q3, q2, [x1]
; SME-NEXT: ldp q5, q4, [x2]
; SME-NEXT: umlalb z0.s, z5.h, z3.h
; SME-NEXT: umlalb z1.s, z4.h, z2.h
; SME-NEXT: umlalt z0.s, z5.h, z3.h
; SME-NEXT: umlalt z1.s, z4.h, z2.h
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <8 x i32>, ptr %accptr
%u = load <16 x i16>, ptr %uptr
%s = load <16 x i16>, ptr %sptr
%u.wide = zext <16 x i16> %u to <16 x i32>
%s.wide = zext <16 x i16> %s to <16 x i32>
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
%partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <16 x i32> %mult)
ret <8 x i32> %partial.reduce
}
define <8 x i32> @two_way_i16_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
;
;
; NEON-LABEL: two_way_i16_i32_vl256:
; NEON: // %bb.0:
; NEON-NEXT: ldp q0, q1, [x0]
; NEON-NEXT: ldp q2, q3, [x1]
; NEON-NEXT: ldp q4, q5, [x2]
; NEON-NEXT: umlal v0.4s, v4.4h, v2.4h
; NEON-NEXT: umlal v1.4s, v5.4h, v3.4h
; NEON-NEXT: umlal2 v0.4s, v4.8h, v2.8h
; NEON-NEXT: umlal2 v1.4s, v5.8h, v3.8h
; NEON-NEXT: ret
;
; SVE-LABEL: two_way_i16_i32_vl256:
; SVE: // %bb.0:
; SVE-NEXT: ldr z0, [x1]
; SVE-NEXT: ldr z1, [x2]
; SVE-NEXT: ptrue p0.s
; SVE-NEXT: ldr z4, [x0]
; SVE-NEXT: uunpklo z2.s, z0.h
; SVE-NEXT: uunpklo z3.s, z1.h
; SVE-NEXT: uunpkhi z0.s, z0.h
; SVE-NEXT: uunpkhi z1.s, z1.h
; SVE-NEXT: mad z2.s, p0/m, z3.s, z4.s
; SVE-NEXT: mad z0.s, p0/m, z1.s, z2.s
; SVE-NEXT: movprfx z1, z0
; SVE-NEXT: ext z1.b, z1.b, z0.b, #16
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1
; SVE-NEXT: ret
;
; SME-LABEL: two_way_i16_i32_vl256:
; SME: // %bb.0:
; SME-NEXT: ldr z0, [x0]
; SME-NEXT: ldr z1, [x1]
; SME-NEXT: ldr z2, [x2]
; SME-NEXT: umlalb z0.s, z2.h, z1.h
; SME-NEXT: umlalt z0.s, z2.h, z1.h
; SME-NEXT: movprfx z1, z0
; SME-NEXT: ext z1.b, z1.b, z0.b, #16
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <8 x i32>, ptr %accptr
%u = load <16 x i16>, ptr %uptr
%s = load <16 x i16>, ptr %sptr
%u.wide = zext <16 x i16> %u to <16 x i32>
%s.wide = zext <16 x i16> %s to <16 x i32>
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
%partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <16 x i32> %mult)
ret <8 x i32> %partial.reduce
}
;
; Two-way mla (i32 -> i64)
;
define <2 x i64> @two_way_i32_i64_vl128(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: two_way_i32_i64_vl128:
; COMMON: // %bb.0:
; COMMON-NEXT: ldr q0, [x0]
; COMMON-NEXT: ldr q1, [x1]
; COMMON-NEXT: ldr q2, [x2]
; COMMON-NEXT: umlal v0.2d, v2.2s, v1.2s
; COMMON-NEXT: umlal2 v0.2d, v2.4s, v1.4s
; COMMON-NEXT: ret
;
; SME-LABEL: two_way_i32_i64_vl128:
; SME: // %bb.0:
; SME-NEXT: ldr q0, [x0]
; SME-NEXT: ldr q1, [x1]
; SME-NEXT: ldr q2, [x2]
; SME-NEXT: umlalb z0.d, z2.s, z1.s
; SME-NEXT: umlalt z0.d, z2.s, z1.s
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <2 x i64>, ptr %accptr
%u = load <4 x i32>, ptr %uptr
%s = load <4 x i32>, ptr %sptr
%u.wide = zext <4 x i32> %u to <4 x i64>
%s.wide = zext <4 x i32> %s to <4 x i64>
%mult = mul nuw nsw <4 x i64> %s.wide, %u.wide
%partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <4 x i64> %mult)
ret <2 x i64> %partial.reduce
}
define <4 x i64> @two_way_i32_i64_vl128_double_width(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: two_way_i32_i64_vl128_double_width:
; COMMON: // %bb.0:
; COMMON-NEXT: ldp q0, q1, [x0]
; COMMON-NEXT: ldp q2, q3, [x1]
; COMMON-NEXT: ldp q4, q5, [x2]
; COMMON-NEXT: umlal v0.2d, v4.2s, v2.2s
; COMMON-NEXT: umlal v1.2d, v5.2s, v3.2s
; COMMON-NEXT: umlal2 v0.2d, v4.4s, v2.4s
; COMMON-NEXT: umlal2 v1.2d, v5.4s, v3.4s
; COMMON-NEXT: ret
;
; SME-LABEL: two_way_i32_i64_vl128_double_width:
; SME: // %bb.0:
; SME-NEXT: ldp q0, q1, [x0]
; SME-NEXT: ldp q3, q2, [x1]
; SME-NEXT: ldp q5, q4, [x2]
; SME-NEXT: umlalb z0.d, z5.s, z3.s
; SME-NEXT: umlalb z1.d, z4.s, z2.s
; SME-NEXT: umlalt z0.d, z5.s, z3.s
; SME-NEXT: umlalt z1.d, z4.s, z2.s
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <4 x i64>, ptr %accptr
%u = load <8 x i32>, ptr %uptr
%s = load <8 x i32>, ptr %sptr
%u.wide = zext <8 x i32> %u to <8 x i64>
%s.wide = zext <8 x i32> %s to <8 x i64>
%mult = mul nuw nsw <8 x i64> %s.wide, %u.wide
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <8 x i64> %mult)
ret <4 x i64> %partial.reduce
}
define <4 x i64> @two_way_i32_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
;
;
; NEON-LABEL: two_way_i32_i64_vl256:
; NEON: // %bb.0:
; NEON-NEXT: ldp q0, q1, [x0]
; NEON-NEXT: ldp q2, q3, [x1]
; NEON-NEXT: ldp q4, q5, [x2]
; NEON-NEXT: umlal v0.2d, v4.2s, v2.2s
; NEON-NEXT: umlal v1.2d, v5.2s, v3.2s
; NEON-NEXT: umlal2 v0.2d, v4.4s, v2.4s
; NEON-NEXT: umlal2 v1.2d, v5.4s, v3.4s
; NEON-NEXT: ret
;
; SVE-LABEL: two_way_i32_i64_vl256:
; SVE: // %bb.0:
; SVE-NEXT: ldr z0, [x1]
; SVE-NEXT: ldr z1, [x2]
; SVE-NEXT: ptrue p0.d
; SVE-NEXT: ldr z4, [x0]
; SVE-NEXT: uunpklo z2.d, z0.s
; SVE-NEXT: uunpklo z3.d, z1.s
; SVE-NEXT: uunpkhi z0.d, z0.s
; SVE-NEXT: uunpkhi z1.d, z1.s
; SVE-NEXT: mad z2.d, p0/m, z3.d, z4.d
; SVE-NEXT: mad z0.d, p0/m, z1.d, z2.d
; SVE-NEXT: movprfx z1, z0
; SVE-NEXT: ext z1.b, z1.b, z0.b, #16
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1
; SVE-NEXT: ret
;
; SME-LABEL: two_way_i32_i64_vl256:
; SME: // %bb.0:
; SME-NEXT: ldr z0, [x0]
; SME-NEXT: ldr z1, [x1]
; SME-NEXT: ldr z2, [x2]
; SME-NEXT: umlalb z0.d, z2.s, z1.s
; SME-NEXT: umlalt z0.d, z2.s, z1.s
; SME-NEXT: movprfx z1, z0
; SME-NEXT: ext z1.b, z1.b, z0.b, #16
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <4 x i64>, ptr %accptr
%u = load <8 x i32>, ptr %uptr
%s = load <8 x i32>, ptr %sptr
%u.wide = zext <8 x i32> %u to <8 x i64>
%s.wide = zext <8 x i32> %s to <8 x i64>
%mult = mul nuw nsw <8 x i64> %s.wide, %u.wide
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <8 x i64> %mult)
ret <4 x i64> %partial.reduce
}
;
; Four-way dot (i8 -> i32)
;
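; A four-way dot folds four i8 products into each i32 accumulator lane,
; which is exactly what UDOT/USDOT compute over consecutive quadruplets:
;   acc[i] += m[4i] + m[4i+1] + m[4i+2] + m[4i+3]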
define <4 x i32> @four_way_i8_i32_vl128(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: four_way_i8_i32_vl128:
; COMMON: // %bb.0:
; COMMON-NEXT: ldr q0, [x0]
; COMMON-NEXT: ldr q1, [x1]
; COMMON-NEXT: ldr q2, [x2]
; COMMON-NEXT: udot v0.4s, v2.16b, v1.16b
; COMMON-NEXT: ret
;
; SME-LABEL: four_way_i8_i32_vl128:
; SME: // %bb.0:
; SME-NEXT: ldr q0, [x0]
; SME-NEXT: ldr q1, [x1]
; SME-NEXT: ldr q2, [x2]
; SME-NEXT: udot z0.s, z2.b, z1.b
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <4 x i32>, ptr %accptr
%u = load <16 x i8>, ptr %uptr
%s = load <16 x i8>, ptr %sptr
%u.wide = zext <16 x i8> %u to <16 x i32>
%s.wide = zext <16 x i8> %s to <16 x i32>
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult)
ret <4 x i32> %partial.reduce
}
define <4 x i32> @four_way_i8_i32_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
; COMMON-LABEL: four_way_i8_i32_vl128_usdot:
; COMMON: // %bb.0:
; COMMON-NEXT: ldr q0, [x0]
; COMMON-NEXT: ldr q1, [x1]
; COMMON-NEXT: ldr q2, [x2]
; COMMON-NEXT: usdot v0.4s, v1.16b, v2.16b
; COMMON-NEXT: ret
;
; SME-LABEL: four_way_i8_i32_vl128_usdot:
; SME: // %bb.0:
; SME-NEXT: ldr q0, [x0]
; SME-NEXT: ldr q1, [x1]
; SME-NEXT: ldr q2, [x2]
; SME-NEXT: usdot z0.s, z1.b, z2.b
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <4 x i32>, ptr %accptr
%u = load <16 x i8>, ptr %uptr
%s = load <16 x i8>, ptr %sptr
%u.wide = zext <16 x i8> %u to <16 x i32>
%s.wide = sext <16 x i8> %s to <16 x i32>
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult)
ret <4 x i32> %partial.reduce
}
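; Neither NEON nor SVE has a vector (non-indexed) SUDOT, so the
; sext-by-zext case below is handled by commuting the operands of USDOT,
; as the swapped register order in the checks shows.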
define <4 x i32> @four_way_i8_i32_vl128_sudot(ptr %accptr, ptr %uptr, ptr %sptr) {
; COMMON-LABEL: four_way_i8_i32_vl128_sudot:
; COMMON: // %bb.0:
; COMMON-NEXT: ldr q0, [x0]
; COMMON-NEXT: ldr q1, [x1]
; COMMON-NEXT: ldr q2, [x2]
; COMMON-NEXT: usdot v0.4s, v2.16b, v1.16b
; COMMON-NEXT: ret
;
; SME-LABEL: four_way_i8_i32_vl128_sudot:
; SME: // %bb.0:
; SME-NEXT: ldr q0, [x0]
; SME-NEXT: ldr q1, [x1]
; SME-NEXT: ldr q2, [x2]
; SME-NEXT: usdot z0.s, z2.b, z1.b
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <4 x i32>, ptr %accptr
%u = load <16 x i8>, ptr %uptr
%s = load <16 x i8>, ptr %sptr
%u.wide = sext <16 x i8> %u to <16 x i32>
%s.wide = zext <16 x i8> %s to <16 x i32>
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult)
ret <4 x i32> %partial.reduce
}
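; The i8 -> i64 mixed-sign case needs two steps: usdot into a zeroed i32
; accumulator, then signed widening adds of the i32 halves into the i64
; accumulator (saddw/saddw2 on NEON, saddwb/saddwt in streaming mode,
; sunpklo/sunpkhi + add on SVE).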
define <2 x i64> @four_way_i8_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
; NEON-LABEL: four_way_i8_i64_vl128_usdot:
; NEON: // %bb.0:
; NEON-NEXT: movi v0.2d, #0000000000000000
; NEON-NEXT: ldr q1, [x1]
; NEON-NEXT: ldr q2, [x2]
; NEON-NEXT: usdot v0.4s, v1.16b, v2.16b
; NEON-NEXT: ldr q1, [x0]
; NEON-NEXT: saddw v1.2d, v1.2d, v0.2s
; NEON-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; NEON-NEXT: ret
;
; SVE-LABEL: four_way_i8_i64_vl128_usdot:
; SVE: // %bb.0:
; SVE-NEXT: movi v0.2d, #0000000000000000
; SVE-NEXT: ldr q1, [x1]
; SVE-NEXT: ldr q2, [x2]
; SVE-NEXT: usdot z0.s, z1.b, z2.b
; SVE-NEXT: ldr q2, [x0]
; SVE-NEXT: sunpklo z1.d, z0.s
; SVE-NEXT: sunpkhi z0.d, z0.s
; SVE-NEXT: add z1.d, z2.d, z1.d
; SVE-NEXT: add z0.d, z1.d, z0.d
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: ret
;
; SME-LABEL: four_way_i8_i64_vl128_usdot:
; SME: // %bb.0:
; SME-NEXT: mov z0.s, #0 // =0x0
; SME-NEXT: ldr q1, [x1]
; SME-NEXT: ldr q2, [x2]
; SME-NEXT: usdot z0.s, z1.b, z2.b
; SME-NEXT: ldr q1, [x0]
; SME-NEXT: saddwb z1.d, z1.d, z0.s
; SME-NEXT: saddwt z0.d, z1.d, z0.s
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <2 x i64>, ptr %accptr
%u = load <16 x i8>, ptr %uptr
%s = load <16 x i8>, ptr %sptr
%u.wide = zext <16 x i8> %u to <16 x i64>
%s.wide = sext <16 x i8> %s to <16 x i64>
%mult = mul nuw nsw <16 x i64> %s.wide, %u.wide
%partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <16 x i64> %mult)
ret <2 x i64> %partial.reduce
}
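; For i16 -> i64 with mixed extensions there is no dot-product form at
; all: NEON extends to i32 and uses smlal/smlal2, while the streaming run
; falls back to predicated extending loads (ld1h/ld1sh) plus mla.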
define <2 x i64> @four_way_i16_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
; COMMON-LABEL: four_way_i16_i64_vl128_usdot:
; COMMON: // %bb.0:
; COMMON-NEXT: ldr q1, [x1]
; COMMON-NEXT: ldr q2, [x2]
; COMMON-NEXT: ldr q0, [x0]
; COMMON-NEXT: ushll v3.4s, v1.4h, #0
; COMMON-NEXT: sshll v4.4s, v2.4h, #0
; COMMON-NEXT: ushll2 v1.4s, v1.8h, #0
; COMMON-NEXT: sshll2 v2.4s, v2.8h, #0
; COMMON-NEXT: smlal v0.2d, v4.2s, v3.2s
; COMMON-NEXT: smlal2 v0.2d, v4.4s, v3.4s
; COMMON-NEXT: smlal v0.2d, v2.2s, v1.2s
; COMMON-NEXT: smlal2 v0.2d, v2.4s, v1.4s
; COMMON-NEXT: ret
;
; SME-LABEL: four_way_i16_i64_vl128_usdot:
; SME: // %bb.0:
; SME-NEXT: ptrue p0.d, vl2
; SME-NEXT: ldr q2, [x0]
; SME-NEXT: mov x8, #2 // =0x2
; SME-NEXT: ld1h { z0.d }, p0/z, [x1]
; SME-NEXT: ld1sh { z1.d }, p0/z, [x2]
; SME-NEXT: mad z0.d, p0/m, z1.d, z2.d
; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
; SME-NEXT: mov x8, #4 // =0x4
; SME-NEXT: mla z0.d, p0/m, z2.d, z1.d
; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
; SME-NEXT: mov x8, #6 // =0x6
; SME-NEXT: mla z0.d, p0/m, z2.d, z1.d
; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
; SME-NEXT: mla z0.d, p0/m, z2.d, z1.d
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <2 x i64>, ptr %accptr
%u = load <8 x i16>, ptr %uptr
%s = load <8 x i16>, ptr %sptr
%u.wide = zext <8 x i16> %u to <8 x i64>
%s.wide = sext <8 x i16> %s to <8 x i64>
%mult = mul nuw nsw <8 x i64> %s.wide, %u.wide
%partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %mult)
ret <2 x i64> %partial.reduce
}
define <8 x i32> @four_way_i8_i32_vl128_double_width(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: four_way_i8_i32_vl128_double_width:
; COMMON: // %bb.0:
; COMMON-NEXT: ldp q0, q1, [x0]
; COMMON-NEXT: ldp q3, q2, [x1]
; COMMON-NEXT: ldp q5, q4, [x2]
; COMMON-NEXT: udot v0.4s, v5.16b, v3.16b
; COMMON-NEXT: udot v1.4s, v4.16b, v2.16b
; COMMON-NEXT: ret
;
; SME-LABEL: four_way_i8_i32_vl128_double_width:
; SME: // %bb.0:
; SME-NEXT: ldp q0, q1, [x0]
; SME-NEXT: ldp q3, q2, [x1]
; SME-NEXT: ldp q5, q4, [x2]
; SME-NEXT: udot z0.s, z5.b, z3.b
; SME-NEXT: udot z1.s, z4.b, z2.b
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <8 x i32>, ptr %accptr
%u = load <32 x i8>, ptr %uptr
%s = load <32 x i8>, ptr %sptr
%u.wide = zext <32 x i8> %u to <32 x i32>
%s.wide = zext <32 x i8> %s to <32 x i32>
%mult = mul nuw nsw <32 x i32> %s.wide, %u.wide
%partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult)
ret <8 x i32> %partial.reduce
}
define <8 x i32> @four_way_i8_i32_vl128_double_width_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: four_way_i8_i32_vl128_double_width_usdot:
; COMMON: // %bb.0:
; COMMON-NEXT: ldp q0, q1, [x0]
; COMMON-NEXT: ldp q3, q2, [x1]
; COMMON-NEXT: ldp q5, q4, [x2]
; COMMON-NEXT: usdot v0.4s, v3.16b, v5.16b
; COMMON-NEXT: usdot v1.4s, v2.16b, v4.16b
; COMMON-NEXT: ret
;
; SME-LABEL: four_way_i8_i32_vl128_double_width_usdot:
; SME: // %bb.0:
; SME-NEXT: ldp q0, q1, [x0]
; SME-NEXT: ldp q3, q2, [x1]
; SME-NEXT: ldp q5, q4, [x2]
; SME-NEXT: usdot z0.s, z3.b, z5.b
; SME-NEXT: usdot z1.s, z2.b, z4.b
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <8 x i32>, ptr %accptr
%u = load <32 x i8>, ptr %uptr
%s = load <32 x i8>, ptr %sptr
%u.wide = zext <32 x i8> %u to <32 x i32>
%s.wide = sext <32 x i8> %s to <32 x i32>
%mult = mul nuw nsw <32 x i32> %s.wide, %u.wide
%partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult)
ret <8 x i32> %partial.reduce
}
define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
;
;
; NEON-LABEL: four_way_i8_i32_vl256:
; NEON: // %bb.0:
; NEON-NEXT: ldp q0, q1, [x0]
; NEON-NEXT: ldp q3, q2, [x1]
; NEON-NEXT: ldp q5, q4, [x2]
; NEON-NEXT: udot v0.4s, v5.16b, v3.16b
; NEON-NEXT: udot v1.4s, v4.16b, v2.16b
; NEON-NEXT: ret
;
; SVE-LABEL: four_way_i8_i32_vl256:
; SVE: // %bb.0:
; SVE-NEXT: ldr z0, [x0]
; SVE-NEXT: ldr z1, [x1]
; SVE-NEXT: ldr z2, [x2]
; SVE-NEXT: udot z0.s, z2.b, z1.b
; SVE-NEXT: movprfx z1, z0
; SVE-NEXT: ext z1.b, z1.b, z0.b, #16
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1
; SVE-NEXT: ret
;
; SME-LABEL: four_way_i8_i32_vl256:
; SME: // %bb.0:
; SME-NEXT: ldr z0, [x0]
; SME-NEXT: ldr z1, [x1]
; SME-NEXT: ldr z2, [x2]
; SME-NEXT: udot z0.s, z2.b, z1.b
; SME-NEXT: movprfx z1, z0
; SME-NEXT: ext z1.b, z1.b, z0.b, #16
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <8 x i32>, ptr %accptr
%u = load <32 x i8>, ptr %uptr
%s = load <32 x i8>, ptr %sptr
%u.wide = zext <32 x i8> %u to <32 x i32>
%s.wide = zext <32 x i8> %s to <32 x i32>
%mult = mul nuw nsw <32 x i32> %s.wide, %u.wide
%partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult)
ret <8 x i32> %partial.reduce
}
define <8 x i32> @four_way_i8_i32_vl256_usdot(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
;
;
; NEON-LABEL: four_way_i8_i32_vl256_usdot:
; NEON: // %bb.0:
; NEON-NEXT: ldp q0, q1, [x0]
; NEON-NEXT: ldp q3, q2, [x1]
; NEON-NEXT: ldp q5, q4, [x2]
; NEON-NEXT: usdot v0.4s, v3.16b, v5.16b
; NEON-NEXT: usdot v1.4s, v2.16b, v4.16b
; NEON-NEXT: ret
;
; SVE-LABEL: four_way_i8_i32_vl256_usdot:
; SVE: // %bb.0:
; SVE-NEXT: ldr z0, [x0]
; SVE-NEXT: ldr z1, [x1]
; SVE-NEXT: ldr z2, [x2]
; SVE-NEXT: usdot z0.s, z1.b, z2.b
; SVE-NEXT: movprfx z1, z0
; SVE-NEXT: ext z1.b, z1.b, z0.b, #16
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1
; SVE-NEXT: ret
;
; SME-LABEL: four_way_i8_i32_vl256_usdot:
; SME: // %bb.0:
; SME-NEXT: ldr z0, [x0]
; SME-NEXT: ldr z1, [x1]
; SME-NEXT: ldr z2, [x2]
; SME-NEXT: usdot z0.s, z1.b, z2.b
; SME-NEXT: movprfx z1, z0
; SME-NEXT: ext z1.b, z1.b, z0.b, #16
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <8 x i32>, ptr %accptr
%u = load <32 x i8>, ptr %uptr
%s = load <32 x i8>, ptr %sptr
%u.wide = zext <32 x i8> %u to <32 x i32>
%s.wide = sext <32 x i8> %s to <32 x i32>
%mult = mul nuw nsw <32 x i32> %s.wide, %u.wide
%partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult)
ret <8 x i32> %partial.reduce
}
;
; Four-way dot (i16 -> i64)
;
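; SVE's UDOT also comes in a .d form with .h sources, so a single udot
; covers i16 -> i64; NEON has no such instruction, hence the umull/uaddw
; chains in the COMMON checks.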
define <2 x i64> @four_way_i16_i64_vl128(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: four_way_i16_i64_vl128:
; COMMON: // %bb.0:
; COMMON-NEXT: ldr q0, [x1]
; COMMON-NEXT: ldr q1, [x2]
; COMMON-NEXT: ldr q3, [x0]
; COMMON-NEXT: umull v2.4s, v1.4h, v0.4h
; COMMON-NEXT: umull2 v0.4s, v1.8h, v0.8h
; COMMON-NEXT: uaddw v3.2d, v3.2d, v2.2s
; COMMON-NEXT: uaddw2 v1.2d, v3.2d, v2.4s
; COMMON-NEXT: uaddw v1.2d, v1.2d, v0.2s
; COMMON-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; COMMON-NEXT: ret
;
; SME-LABEL: four_way_i16_i64_vl128:
; SME: // %bb.0:
; SME-NEXT: ldr q0, [x0]
; SME-NEXT: ldr q1, [x1]
; SME-NEXT: ldr q2, [x2]
; SME-NEXT: udot z0.d, z2.h, z1.h
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <2 x i64>, ptr %accptr
%u = load <8 x i16>, ptr %uptr
%s = load <8 x i16>, ptr %sptr
%u.wide = zext <8 x i16> %u to <8 x i64>
%s.wide = zext <8 x i16> %s to <8 x i64>
%mult = mul nuw nsw <8 x i64> %s.wide, %u.wide
%partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %mult)
ret <2 x i64> %partial.reduce
}
define <4 x i64> @four_way_i16_i64_vl128_double_width(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; COMMON-LABEL: four_way_i16_i64_vl128_double_width:
; COMMON: // %bb.0:
; COMMON-NEXT: ldp q0, q1, [x1]
; COMMON-NEXT: ldp q2, q3, [x2]
; COMMON-NEXT: ldp q7, q6, [x0]
; COMMON-NEXT: umull v4.4s, v3.4h, v1.4h
; COMMON-NEXT: umull v5.4s, v2.4h, v0.4h
; COMMON-NEXT: umull2 v1.4s, v3.8h, v1.8h
; COMMON-NEXT: umull2 v0.4s, v2.8h, v0.8h
; COMMON-NEXT: uaddw v7.2d, v7.2d, v5.2s
; COMMON-NEXT: uaddw v6.2d, v6.2d, v4.2s
; COMMON-NEXT: uaddw2 v2.2d, v7.2d, v5.4s
; COMMON-NEXT: uaddw2 v3.2d, v6.2d, v4.4s
; COMMON-NEXT: uaddw v2.2d, v2.2d, v0.2s
; COMMON-NEXT: uaddw v3.2d, v3.2d, v1.2s
; COMMON-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; COMMON-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; COMMON-NEXT: ret
;
; SME-LABEL: four_way_i16_i64_vl128_double_width:
; SME: // %bb.0:
; SME-NEXT: ldp q0, q1, [x0]
; SME-NEXT: ldp q3, q2, [x1]
; SME-NEXT: ldp q5, q4, [x2]
; SME-NEXT: udot z0.d, z5.h, z3.h
; SME-NEXT: udot z1.d, z4.h, z2.h
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <4 x i64>, ptr %accptr
%u = load <16 x i16>, ptr %uptr
%s = load <16 x i16>, ptr %sptr
%u.wide = zext <16 x i16> %u to <16 x i64>
%s.wide = zext <16 x i16> %s to <16 x i64>
%mult = mul nuw nsw <16 x i64> %s.wide, %u.wide
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <16 x i64> %mult)
ret <4 x i64> %partial.reduce
}
define <4 x i64> @four_way_i16_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
;
;
; NEON-LABEL: four_way_i16_i64_vl256:
; NEON: // %bb.0:
; NEON-NEXT: ldp q0, q1, [x1]
; NEON-NEXT: ldp q2, q3, [x2]
; NEON-NEXT: ldp q7, q6, [x0]
; NEON-NEXT: umull v4.4s, v3.4h, v1.4h
; NEON-NEXT: umull v5.4s, v2.4h, v0.4h
; NEON-NEXT: umull2 v1.4s, v3.8h, v1.8h
; NEON-NEXT: umull2 v0.4s, v2.8h, v0.8h
; NEON-NEXT: uaddw v7.2d, v7.2d, v5.2s
; NEON-NEXT: uaddw v6.2d, v6.2d, v4.2s
; NEON-NEXT: uaddw2 v2.2d, v7.2d, v5.4s
; NEON-NEXT: uaddw2 v3.2d, v6.2d, v4.4s
; NEON-NEXT: uaddw v2.2d, v2.2d, v0.2s
; NEON-NEXT: uaddw v3.2d, v3.2d, v1.2s
; NEON-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; NEON-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; NEON-NEXT: ret
;
; SVE-LABEL: four_way_i16_i64_vl256:
; SVE: // %bb.0:
; SVE-NEXT: ldr z0, [x0]
; SVE-NEXT: ldr z1, [x1]
; SVE-NEXT: ldr z2, [x2]
; SVE-NEXT: udot z0.d, z2.h, z1.h
; SVE-NEXT: movprfx z1, z0
; SVE-NEXT: ext z1.b, z1.b, z0.b, #16
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1
; SVE-NEXT: ret
;
; SME-LABEL: four_way_i16_i64_vl256:
; SME: // %bb.0:
; SME-NEXT: ldr z0, [x0]
; SME-NEXT: ldr z1, [x1]
; SME-NEXT: ldr z2, [x2]
; SME-NEXT: udot z0.d, z2.h, z1.h
; SME-NEXT: movprfx z1, z0
; SME-NEXT: ext z1.b, z1.b, z0.b, #16
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <4 x i64>, ptr %accptr
%u = load <16 x i16>, ptr %uptr
%s = load <16 x i16>, ptr %sptr
%u.wide = zext <16 x i16> %u to <16 x i64>
%s.wide = zext <16 x i16> %s to <16 x i64>
%mult = mul nuw nsw <16 x i64> %s.wide, %u.wide
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <16 x i64> %mult)
ret <4 x i64> %partial.reduce
}
;
; Eight-way dot, requires two steps (i8 -> i64)
;
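; No single instruction reduces i8 all the way to i64, so this takes two
; steps: a four-way udot into a zeroed i32 accumulator, then a widening
; add of the i32 halves into the i64 accumulator (uaddw/uaddw2 on NEON,
; uaddwb/uaddwt in streaming mode, uunpklo/uunpkhi + add on SVE).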
define <2 x i64> @eight_way_i8_i64_vl128(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; NEON-LABEL: eight_way_i8_i64_vl128:
; NEON: // %bb.0:
; NEON-NEXT: movi v0.2d, #0000000000000000
; NEON-NEXT: ldr q1, [x1]
; NEON-NEXT: ldr q2, [x2]
; NEON-NEXT: udot v0.4s, v2.16b, v1.16b
; NEON-NEXT: ldr q1, [x0]
; NEON-NEXT: uaddw v1.2d, v1.2d, v0.2s
; NEON-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; NEON-NEXT: ret
;
; SVE-LABEL: eight_way_i8_i64_vl128:
; SVE: // %bb.0:
; SVE-NEXT: movi v0.2d, #0000000000000000
; SVE-NEXT: ldr q1, [x1]
; SVE-NEXT: ldr q2, [x2]
; SVE-NEXT: udot z0.s, z2.b, z1.b
; SVE-NEXT: ldr q2, [x0]
; SVE-NEXT: uunpklo z1.d, z0.s
; SVE-NEXT: uunpkhi z0.d, z0.s
; SVE-NEXT: add z1.d, z2.d, z1.d
; SVE-NEXT: add z0.d, z1.d, z0.d
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: ret
;
; SME-LABEL: eight_way_i8_i64_vl128:
; SME: // %bb.0:
; SME-NEXT: mov z0.s, #0 // =0x0
; SME-NEXT: ldr q1, [x1]
; SME-NEXT: ldr q2, [x2]
; SME-NEXT: udot z0.s, z2.b, z1.b
; SME-NEXT: ldr q1, [x0]
; SME-NEXT: uaddwb z1.d, z1.d, z0.s
; SME-NEXT: uaddwt z0.d, z1.d, z0.s
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: ret
%acc = load <2 x i64>, ptr %accptr
%u = load <16 x i8>, ptr %uptr
%s = load <16 x i8>, ptr %sptr
%u.wide = zext <16 x i8> %u to <16 x i64>
%s.wide = zext <16 x i8> %s to <16 x i64>
%mult = mul nuw nsw <16 x i64> %s.wide, %u.wide
%partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <16 x i64> %mult)
ret <2 x i64> %partial.reduce
}
define <4 x i64> @four_way_i8_i64_vl128_double_width(ptr %accptr, ptr %uptr, ptr %sptr) {
;
; NEON-LABEL: four_way_i8_i64_vl128_double_width:
; NEON: // %bb.0:
; NEON-NEXT: movi v1.2d, #0000000000000000
; NEON-NEXT: movi v0.2d, #0000000000000000
; NEON-NEXT: ldp q3, q2, [x1]
; NEON-NEXT: ldp q5, q4, [x2]
; NEON-NEXT: udot v0.4s, v5.16b, v3.16b
; NEON-NEXT: udot v1.4s, v4.16b, v2.16b
; NEON-NEXT: ldp q3, q2, [x0]
; NEON-NEXT: uaddw v3.2d, v3.2d, v0.2s
; NEON-NEXT: uaddw v2.2d, v2.2d, v1.2s
; NEON-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; NEON-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; NEON-NEXT: ret
;
; SVE-LABEL: four_way_i8_i64_vl128_double_width:
; SVE: // %bb.0:
; SVE-NEXT: movi v0.2d, #0000000000000000
; SVE-NEXT: movi v1.2d, #0000000000000000
; SVE-NEXT: ldp q3, q2, [x1]
; SVE-NEXT: ldp q5, q4, [x2]
; SVE-NEXT: udot z1.s, z5.b, z3.b
; SVE-NEXT: udot z0.s, z4.b, z2.b
; SVE-NEXT: ldp q5, q4, [x0]
; SVE-NEXT: uunpklo z2.d, z1.s
; SVE-NEXT: uunpklo z3.d, z0.s
; SVE-NEXT: uunpkhi z1.d, z1.s
; SVE-NEXT: uunpkhi z6.d, z0.s
; SVE-NEXT: add z0.d, z5.d, z2.d
; SVE-NEXT: add z2.d, z4.d, z3.d
; SVE-NEXT: add z0.d, z0.d, z1.d
; SVE-NEXT: add z1.d, z2.d, z6.d
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1
; SVE-NEXT: ret
;
; SME-LABEL: four_way_i8_i64_vl128_double_width:
; SME: // %bb.0:
; SME-NEXT: mov z1.s, #0 // =0x0
; SME-NEXT: mov z0.s, #0 // =0x0
; SME-NEXT: ldp q3, q2, [x1]
; SME-NEXT: ldp q5, q4, [x2]
; SME-NEXT: udot z0.s, z5.b, z3.b
; SME-NEXT: udot z1.s, z4.b, z2.b
; SME-NEXT: ldp q3, q2, [x0]
; SME-NEXT: uaddwb z3.d, z3.d, z0.s
; SME-NEXT: uaddwb z2.d, z2.d, z1.s
; SME-NEXT: uaddwt z0.d, z3.d, z0.s
; SME-NEXT: uaddwt z1.d, z2.d, z1.s
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <4 x i64>, ptr %accptr
%u = load <32 x i8>, ptr %uptr
%s = load <32 x i8>, ptr %sptr
%u.wide = zext <32 x i8> %u to <32 x i64>
%s.wide = zext <32 x i8> %s to <32 x i64>
%mult = mul nuw nsw <32 x i64> %s.wide, %u.wide
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <32 x i64> %mult)
ret <4 x i64> %partial.reduce
}
define <4 x i64> @four_way_i8_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
; NEON-LABEL: four_way_i8_i64_vl256:
; NEON: // %bb.0:
; NEON-NEXT: movi v1.2d, #0000000000000000
; NEON-NEXT: movi v0.2d, #0000000000000000
; NEON-NEXT: ldp q3, q2, [x1]
; NEON-NEXT: ldp q5, q4, [x2]
; NEON-NEXT: udot v0.4s, v5.16b, v3.16b
; NEON-NEXT: udot v1.4s, v4.16b, v2.16b
; NEON-NEXT: ldp q3, q2, [x0]
; NEON-NEXT: uaddw v3.2d, v3.2d, v0.2s
; NEON-NEXT: uaddw v2.2d, v2.2d, v1.2s
; NEON-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; NEON-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; NEON-NEXT: ret
;
; SVE-LABEL: four_way_i8_i64_vl256:
; SVE: // %bb.0:
; SVE-NEXT: movi v0.2d, #0000000000000000
; SVE-NEXT: ldr z1, [x1]
; SVE-NEXT: ldr z2, [x2]
; SVE-NEXT: udot z0.s, z2.b, z1.b
; SVE-NEXT: ldr z2, [x0]
; SVE-NEXT: uunpklo z1.d, z0.s
; SVE-NEXT: uunpkhi z0.d, z0.s
; SVE-NEXT: add z1.d, z2.d, z1.d
; SVE-NEXT: add z0.d, z1.d, z0.d
; SVE-NEXT: movprfx z1, z0
; SVE-NEXT: ext z1.b, z1.b, z0.b, #16
; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: // kill: def $q1 killed $q1 killed $z1
; SVE-NEXT: ret
;
; SME-LABEL: four_way_i8_i64_vl256:
; SME: // %bb.0:
; SME-NEXT: ldr z0, [x1]
; SME-NEXT: ldr z1, [x2]
; SME-NEXT: mov z2.s, #0 // =0x0
; SME-NEXT: udot z2.s, z1.b, z0.b
; SME-NEXT: ldr z0, [x0]
; SME-NEXT: uaddwb z0.d, z0.d, z2.s
; SME-NEXT: uaddwt z0.d, z0.d, z2.s
; SME-NEXT: movprfx z1, z0
; SME-NEXT: ext z1.b, z1.b, z0.b, #16
; SME-NEXT: // kill: def $q0 killed $q0 killed $z0
; SME-NEXT: // kill: def $q1 killed $q1 killed $z1
; SME-NEXT: ret
%acc = load <4 x i64>, ptr %accptr
%u = load <32 x i8>, ptr %uptr
%s = load <32 x i8>, ptr %sptr
%u.wide = zext <32 x i8> %u to <32 x i64>
%s.wide = zext <32 x i8> %s to <32 x i64>
%mult = mul nuw nsw <32 x i64> %s.wide, %u.wide
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add(<4 x i64> %acc, <32 x i64> %mult)
ret <4 x i64> %partial.reduce
}