Ruiling, Song 0487db1f13
MachineScheduler: Improve instruction clustering (#137784)
Previously, clustered nodes were managed by adding weak edges between
neighbouring cluster nodes, which effectively formed an ordered queue.
These edges were later recorded as `NextClusterPred` or
`NextClusterSucc` in `ScheduleDAGMI`.

However, instructions may not be picked in the exact order of the
queue. For example, suppose we have a queue of cluster nodes A B C. During
scheduling, node B might be picked first, and then it is very likely
that only B and C get clustered under top-down scheduling (leaving A alone).

Another issue is:
```
   if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
      std::swap(SUa, SUb);
   if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
```
may break the cluster queue.

For example, suppose we want to cluster nodes in the order given by
`MemOpRecords`: 1 3 2. Normally 1 (SUa) becomes a pred of 3 (SUb). But for
the pair (3, 2), since 3 (SUa) > 2 (SUb), we swap the two nodes, which makes
2 a pred of 3. Now both 1 and 2 are preds of 3, but there is no
edge between 1 and 2, so the cluster chain is broken.

To fix both issues, this change introduces an unordered set of cluster
nodes. This can help improve clustering in some hard cases.

One key reason the change causes so many test check updates is that the
cluster candidates are no longer ordered, so the candidates might be picked
in a different order than before.

The most affected targets are: AMDGPU, AArch64, RISCV.

For RISCV, most changes seem to be just minor instruction reordering; I
don't see any obvious regression.

For AArch64, some cases of combining ldr into ldp were affected, with two
cases regressing and two improving. The deeper reason is that the
machine scheduler cannot cluster them well either before or after the
change, and the load-combining algorithm that runs later is also not
smart enough.

For AMDGPU, some cases use more v_dual instructions while others
regress. This seems less critical. The test `v_vselect_v32bf16` appears
to get more buffer_load instructions claused.
2025-06-05 15:28:04 +08:00

1416 lines
77 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s
; Baseline case: load two <4 x i8> vectors from %p and %p+4, zero-extend both
; to <4 x i16> and add them; neither operand is shifted.
; The expected assembly is one paired load (ldp s0, s1) feeding a single uaddl.
define <4 x i16> @normal_load_v4i8(ptr %p) {
; CHECK-LABEL: normal_load_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%l1 = load <4 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 4
%l2 = load <4 x i8>, ptr %q
%e1 = zext <4 x i8> %l1 to <4 x i16>
%e2 = zext <4 x i8> %l2 to <4 x i16>
%a = add <4 x i16> %e1, %e2
ret <4 x i16> %a
}
; Baseline case: load two <4 x i16> vectors from %p and %p+8, zero-extend both
; to <4 x i32> and add them; neither operand is shifted.
; The expected assembly is one paired load (ldp d0, d1) feeding a single uaddl.
define <4 x i32> @normal_load_v4i16_v4i32(ptr %p) {
; CHECK-LABEL: normal_load_v4i16_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp d0, d1, [x0]
; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-NEXT: ret
%l1 = load <4 x i16>, ptr %p
%q = getelementptr i8, ptr %p, i32 8
%l2 = load <4 x i16>, ptr %q
%e1 = zext <4 x i16> %l1 to <4 x i32>
%e2 = zext <4 x i16> %l2 to <4 x i32>
%a = add <4 x i32> %e1, %e2
ret <4 x i32> %a
}
; Load two <4 x i8> vectors from %p and %p+4, zero-extend both to <4 x i16>,
; shift the second left by 3 and add it to the first.
; The expected assembly keeps the two loads paired (ldp s1, s0), widens and
; shifts the second value (ushll #0, shl #3) and finishes with uaddw.
define <4 x i16> @load_v4i8(ptr %p) {
; CHECK-LABEL: load_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s1, s0, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: shl v0.4h, v0.4h, #3
; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%l1 = load <4 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 4
%l2 = load <4 x i8>, ptr %q
%e1 = zext <4 x i8> %l1 to <4 x i16>
%e2 = zext <4 x i8> %l2 to <4 x i16>
%e3 = shl <4 x i16> %e2, <i16 3, i16 3, i16 3, i16 3>
%a = add <4 x i16> %e1, %e3
ret <4 x i16> %a
}
; Load two <4 x i16> vectors from %p and %p+8, zero-extend both to <4 x i32>,
; shift the second left by 3 and add it to the first.
; The expected assembly covers both loads with one 16-byte ldr q0, applies
; ushll2 #3 to the high half and uaddw with the low half.
define <4 x i32> @load_v4i16_v4i32(ptr %p) {
; CHECK-LABEL: load_v4i16_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3
; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT: ret
%l1 = load <4 x i16>, ptr %p
%q = getelementptr i8, ptr %p, i32 8
%l2 = load <4 x i16>, ptr %q
%e1 = zext <4 x i16> %l1 to <4 x i32>
%e2 = zext <4 x i16> %l2 to <4 x i32>
%e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
%a = add <4 x i32> %e1, %e3
ret <4 x i32> %a
}
; Load two <4 x i32> vectors from %p and %p+16, zero-extend both to <4 x i64>,
; shift the second left by 3 and add it to the first.
; The expected assembly uses a paired load (ldp q2, q0), then ushll2/ushll #3
; on the second value and uaddw2/uaddw with the first, producing the result in
; two registers (v0, v1).
define <4 x i64> @load_v4i32_v4i64(ptr %p) {
; CHECK-LABEL: load_v4i32_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q2, q0, [x0]
; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3
; CHECK-NEXT: ushll v0.2d, v0.2s, #3
; CHECK-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
; CHECK-NEXT: ret
%l1 = load <4 x i32>, ptr %p
%q = getelementptr i8, ptr %p, i32 16
%l2 = load <4 x i32>, ptr %q
%e1 = zext <4 x i32> %l1 to <4 x i64>
%e2 = zext <4 x i32> %l2 to <4 x i64>
%e3 = shl <4 x i64> %e2, <i64 3, i64 3, i64 3, i64 3>
%a = add <4 x i64> %e1, %e3
ret <4 x i64> %a
}
; Load two <4 x i8> vectors from %p and %p+4, zero-extend both directly to
; <4 x i32> (a two-step widening), shift the second left by 3 and add.
; The expected assembly covers both loads with one 8-byte ldr d0, widens to
; 16-bit lanes with ushll #0, then uses ushll2 #3 on the high half and uaddw
; with the low half.
define <4 x i32> @load_v4i8_v4i32(ptr %p) {
; CHECK-LABEL: load_v4i8_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3
; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT: ret
%l1 = load <4 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 4
%l2 = load <4 x i8>, ptr %q
%e1 = zext <4 x i8> %l1 to <4 x i32>
%e2 = zext <4 x i8> %l2 to <4 x i32>
%e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
%a = add <4 x i32> %e1, %e3
ret <4 x i32> %a
}
; Same extend/shift/add shape as the tests above, but with the non-byte-sized
; element type i12: load two <4 x i12> vectors from %p and %p+6, zero-extend
; both to <4 x i32>, shift the second left by 3 and add.
; The expected assembly does not use vector loads here: it loads scalars
; (ldr x8, ldr w9), extracts each 12-bit field with shifts/ubfx/and, inserts
; them lane by lane, and only then performs ushll #3 and uaddw.
define <4 x i32> @load_v4i12_v4i32(ptr %p) {
; CHECK-LABEL: load_v4i12_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x8, [x0]
; CHECK-NEXT: ldr w9, [x0, #8]
; CHECK-NEXT: lsr x10, x8, #60
; CHECK-NEXT: ubfx x11, x8, #48, #12
; CHECK-NEXT: ubfx w12, w9, #8, #12
; CHECK-NEXT: orr w10, w10, w9, lsl #4
; CHECK-NEXT: fmov s0, w11
; CHECK-NEXT: and w11, w8, #0xfff
; CHECK-NEXT: fmov s1, w11
; CHECK-NEXT: lsr x9, x9, #20
; CHECK-NEXT: and w10, w10, #0xfff
; CHECK-NEXT: mov v0.h[1], w10
; CHECK-NEXT: ubfx w10, w8, #12, #12
; CHECK-NEXT: mov v1.h[1], w10
; CHECK-NEXT: ubfx x10, x8, #24, #12
; CHECK-NEXT: ubfx x8, x8, #36, #12
; CHECK-NEXT: mov v0.h[2], w12
; CHECK-NEXT: mov v1.h[2], w10
; CHECK-NEXT: mov v0.h[3], w9
; CHECK-NEXT: mov v1.h[3], w8
; CHECK-NEXT: ushll v0.4s, v0.4h, #3
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT: ret
%l1 = load <4 x i12>, ptr %p
%q = getelementptr i8, ptr %p, i32 6
%l2 = load <4 x i12>, ptr %q
%e1 = zext <4 x i12> %l1 to <4 x i32>
%e2 = zext <4 x i12> %l2 to <4 x i32>
%e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
%a = add <4 x i32> %e1, %e3
ret <4 x i32> %a
}
; Load two <8 x i8> vectors from %p and %p+8, zero-extend both to <8 x i16>,
; shift the second left by 3 and add it to the first.
; The expected assembly covers both loads with one 16-byte ldr q0, applies
; ushll2 #3 to the high half and uaddw with the low half.
define <8 x i16> @load_v8i8(ptr %p) {
; CHECK-LABEL: load_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3
; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b
; CHECK-NEXT: ret
%l1 = load <8 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 8
%l2 = load <8 x i8>, ptr %q
%e1 = zext <8 x i8> %l1 to <8 x i16>
%e2 = zext <8 x i8> %l2 to <8 x i16>
%e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
%a = add <8 x i16> %e1, %e3
ret <8 x i16> %a
}
; Two source pointers: from each of %p1 and %p2 load two <8 x i8> vectors
; (offsets 0 and 8). The matching halves are added as i8 first
; (%l11+%l21, %l12+%l22); the sums are then zero-extended to <8 x i16>, the
; second is shifted left by 3 and the two are added.
; The expected assembly loads 16 bytes from each pointer (ldr q0, ldr q1),
; does a single 16-lane byte add, then ushll2 #3 and uaddw.
define <8 x i16> @loadadd_v8i8(ptr %p1, ptr %p2) {
; CHECK-LABEL: loadadd_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: add v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3
; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b
; CHECK-NEXT: ret
%l11 = load <8 x i8>, ptr %p1
%q1 = getelementptr i8, ptr %p1, i32 8
%l12 = load <8 x i8>, ptr %q1
%l21 = load <8 x i8>, ptr %p2
%q2 = getelementptr i8, ptr %p2, i32 8
%l22 = load <8 x i8>, ptr %q2
%l1 = add <8 x i8> %l11, %l21
%l2 = add <8 x i8> %l12, %l22
%e1 = zext <8 x i8> %l1 to <8 x i16>
%e2 = zext <8 x i8> %l2 to <8 x i16>
%e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
%a = add <8 x i16> %e1, %e3
ret <8 x i16> %a
}
; Like loadadd_v8i8, but the four <8 x i8> loads are zero-extended to
; <8 x i16> before the inner adds, and the sums are extended again to
; <8 x i32> before the shift-by-3 and final add.
; The expected assembly loads 16 bytes from each pointer (ldr q0, ldr q1),
; uses uaddl2/uaddl for the widening inner adds, then ushll2/ushll #3 and
; uaddw2/uaddw, producing the result in two registers (v0, v1).
define <8 x i32> @loadaddext_v8i8(ptr %p1, ptr %p2) {
; CHECK-LABEL: loadaddext_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uaddl2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3
; CHECK-NEXT: ushll v2.4s, v2.4h, #3
; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h
; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h
; CHECK-NEXT: ret
%l11 = load <8 x i8>, ptr %p1
%q1 = getelementptr i8, ptr %p1, i32 8
%l12 = load <8 x i8>, ptr %q1
%l21 = load <8 x i8>, ptr %p2
%q2 = getelementptr i8, ptr %p2, i32 8
%l22 = load <8 x i8>, ptr %q2
%le11 = zext <8 x i8> %l11 to <8 x i16>
%le12 = zext <8 x i8> %l12 to <8 x i16>
%le21 = zext <8 x i8> %l21 to <8 x i16>
%le22 = zext <8 x i8> %l22 to <8 x i16>
%l1 = add <8 x i16> %le11, %le21
%l2 = add <8 x i16> %le12, %le22
%e1 = zext <8 x i16> %l1 to <8 x i32>
%e2 = zext <8 x i16> %l2 to <8 x i32>
%e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <8 x i32> %e1, %e3
ret <8 x i32> %a
}
; <4 x i8> variant of loadaddext_v8i8: from each of %p1 and %p2 load two
; <4 x i8> vectors (offsets 0 and 4), zero-extend to <4 x i16>, add matching
; halves, zero-extend the sums to <4 x i32>, shift the second left by 3 and add.
; The expected assembly loads 8 bytes from each pointer (ldr d0, ldr d1), uses
; one uaddl for the inner adds, then ushll2 #3 and uaddw.
define <4 x i32> @loadaddext_v4i8(ptr %p1, ptr %p2) {
; CHECK-LABEL: loadaddext_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3
; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT: ret
%l11 = load <4 x i8>, ptr %p1
%q1 = getelementptr i8, ptr %p1, i32 4
%l12 = load <4 x i8>, ptr %q1
%l21 = load <4 x i8>, ptr %p2
%q2 = getelementptr i8, ptr %p2, i32 4
%l22 = load <4 x i8>, ptr %q2
%le11 = zext <4 x i8> %l11 to <4 x i16>
%le12 = zext <4 x i8> %l12 to <4 x i16>
%le21 = zext <4 x i8> %l21 to <4 x i16>
%le22 = zext <4 x i8> %l22 to <4 x i16>
%l1 = add <4 x i16> %le11, %le21
%l2 = add <4 x i16> %le12, %le22
%e1 = zext <4 x i16> %l1 to <4 x i32>
%e2 = zext <4 x i16> %l2 to <4 x i32>
%e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
%a = add <4 x i32> %e1, %e3
ret <4 x i32> %a
}
; Load two full-width <16 x i8> vectors from %p and %p+16, zero-extend both to
; <16 x i16>, shift the second left by 3 and add it to the first.
; The expected assembly uses a paired load (ldp q2, q0), then ushll2/ushll #3
; on the second value and uaddw2/uaddw with the first, producing the result in
; two registers (v0, v1).
define <16 x i16> @load_v16i8(ptr %p) {
; CHECK-LABEL: load_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q2, q0, [x0]
; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3
; CHECK-NEXT: ushll v0.8h, v0.8b, #3
; CHECK-NEXT: uaddw2 v1.8h, v1.8h, v2.16b
; CHECK-NEXT: uaddw v0.8h, v0.8h, v2.8b
; CHECK-NEXT: ret
%l1 = load <16 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 16
%l2 = load <16 x i8>, ptr %q
%e1 = zext <16 x i8> %l1 to <16 x i16>
%e2 = zext <16 x i8> %l2 to <16 x i16>
%e3 = shl <16 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
%a = add <16 x i16> %e1, %e3
ret <16 x i16> %a
}
; Small-vector case: load two <2 x i8> vectors from %p and %p+2, zero-extend
; both to <2 x i16>, shift the second left by 3 and add it to the first.
; The expected assembly loads the four bytes individually (ldrb at offsets
; 2, 3, 0, 1), builds two 2-lane vectors with fmov/mov lane inserts, and then
; does shl #3 and add on .2s lanes.
define <2 x i16> @std_v2i8_v2i16(ptr %p) {
; CHECK-LABEL: std_v2i8_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrb w9, [x0, #3]
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ldrb w8, [x0]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: mov v0.s[1], w9
; CHECK-NEXT: ldrb w9, [x0, #1]
; CHECK-NEXT: mov v1.s[1], w9
; CHECK-NEXT: shl v0.2s, v0.2s, #3
; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NEXT: ret
%l1 = load <2 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 2
%l2 = load <2 x i8>, ptr %q
%e1 = zext <2 x i8> %l1 to <2 x i16>
%e2 = zext <2 x i8> %l2 to <2 x i16>
%se2 = shl <2 x i16> %e2, <i16 3, i16 3>
%a = add <2 x i16> %e1, %se2
ret <2 x i16> %a
}
; Operands built from two pointers: from each of %p and %q load two <4 x i8>
; vectors (offsets 0 and 4). The shufflevectors concatenate the offset-0 loads
; into %l1 and the offset-4 loads into %l2 (both <8 x i8>). Both are
; zero-extended to <8 x i16>, %l2 is shifted left by 3, and the two are added.
; The expected assembly loads the %p pair with ldp s0, s1, fills lane 1 of each
; register from %q with two ld1 lane loads (the first post-incrementing x1 by
; 4), then does ushll #3 and uaddw.
define <8 x i16> @load_bv_v4i8(ptr %p, ptr %q) {
; CHECK-LABEL: load_bv_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v1.s }[1], [x1]
; CHECK-NEXT: ushll v1.8h, v1.8b, #3
; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b
; CHECK-NEXT: ret
%j1 = load <4 x i8>, ptr %p
%p1 = getelementptr i8, ptr %p, i32 4
%j2 = load <4 x i8>, ptr %p1
%k1 = load <4 x i8>, ptr %q
%q1 = getelementptr i8, ptr %q, i32 4
%k2 = load <4 x i8>, ptr %q1
%l1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%l2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%e1 = zext <8 x i8> %l1 to <8 x i16>
%e2 = zext <8 x i8> %l2 to <8 x i16>
%e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
%a = add <8 x i16> %e1, %e3
ret <8 x i16> %a
}
; Same input construction as load_bv_v4i8 (two <4 x i8> loads from each of %p
; and %q, concatenated by shufflevector into %l1 and %l2), but the
; concatenated vectors are zero-extended directly to <8 x i32> before the
; shift-by-3 and add.
; The expected assembly loads 8 bytes from each pointer (ldr d0, ldr d1),
; widens each with ushll #0, then uses ushll2 #3 and uaddw per register,
; producing the result in two registers (v0, v1).
define <8 x i32> @load_bv_v4i8_i32(ptr %p, ptr %q) {
; CHECK-LABEL: load_bv_v4i8_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3
; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3
; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h
; CHECK-NEXT: uaddw v1.4s, v3.4s, v1.4h
; CHECK-NEXT: ret
%j1 = load <4 x i8>, ptr %p
%p1 = getelementptr i8, ptr %p, i32 4
%j2 = load <4 x i8>, ptr %p1
%k1 = load <4 x i8>, ptr %q
%q1 = getelementptr i8, ptr %q, i32 4
%k2 = load <4 x i8>, ptr %q1
%l1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%l2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%e1 = zext <8 x i8> %l1 to <8 x i32>
%e2 = zext <8 x i8> %l2 to <8 x i32>
%e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <8 x i32> %e1, %e3
ret <8 x i32> %a
}
; <4 x i16> variant of load_bv_v4i8_i32: from each of %p and %q load two
; <4 x i16> vectors (offsets 0 and 8), concatenate the offset-0 loads into %l1
; and the offset-8 loads into %l2, zero-extend both to <8 x i32>, shift %l2
; left by 3 and add.
; The expected assembly loads 16 bytes from each pointer (ldr q0, ldr q1), then
; uses ushll2 #3 and uaddw per register, producing the result in v0 and v1.
define <8 x i32> @load_bv_v4i16_i32(ptr %p, ptr %q) {
; CHECK-LABEL: load_bv_v4i16_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3
; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3
; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h
; CHECK-NEXT: uaddw v1.4s, v3.4s, v1.4h
; CHECK-NEXT: ret
%j1 = load <4 x i16>, ptr %p
%p1 = getelementptr i8, ptr %p, i32 8
%j2 = load <4 x i16>, ptr %p1
%k1 = load <4 x i16>, ptr %q
%q1 = getelementptr i8, ptr %q, i32 8
%k2 = load <4 x i16>, ptr %q1
%l1 = shufflevector <4 x i16> %j1, <4 x i16> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%l2 = shufflevector <4 x i16> %j2, <4 x i16> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%e1 = zext <8 x i16> %l1 to <8 x i32>
%e2 = zext <8 x i16> %l2 to <8 x i32>
%e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <8 x i32> %e1, %e3
ret <8 x i32> %a
}
; Three-pointer variant with a non-power-of-two result: from each of %p, %q and
; %r load two <4 x i8> vectors (offsets 0 and 4). The offset-0 loads are
; concatenated into the 12-element %l1 and the offset-4 loads into %l2; the %r
; loads are first widened to 8 lanes against an undef second operand. Both are
; zero-extended to <12 x i32>, %l2 is shifted left by 3, and the two are added.
; The expected assembly uses ldp plus ld1 lane loads for %p/%q and a separate
; ldp s3, s2 for %r, and writes the result to memory through x8 (stp/str).
; NOTE(review): the stores through x8 are presumably the indirect return of
; the <12 x i32> value - confirm against the AArch64 calling convention.
define <12 x i32> @load_bv_3xv4i8_i32(ptr %p, ptr %q, ptr %r) {
; CHECK-LABEL: load_bv_3xv4i8_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v1.s }[1], [x1]
; CHECK-NEXT: ldp s3, s2, [x2]
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: ushll v3.8h, v3.8b, #0
; CHECK-NEXT: ushll2 v4.4s, v1.8h, #3
; CHECK-NEXT: ushll v1.4s, v1.4h, #3
; CHECK-NEXT: ushll v2.4s, v2.4h, #3
; CHECK-NEXT: uaddw v2.4s, v2.4s, v3.4h
; CHECK-NEXT: uaddw2 v3.4s, v4.4s, v0.8h
; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT: stp q3, q2, [x8, #16]
; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: ret
%j1 = load <4 x i8>, ptr %p
%p1 = getelementptr i8, ptr %p, i32 4
%j2 = load <4 x i8>, ptr %p1
%k1 = load <4 x i8>, ptr %q
%q1 = getelementptr i8, ptr %q, i32 4
%k2 = load <4 x i8>, ptr %q1
%m1 = load <4 x i8>, ptr %r
%r1 = getelementptr i8, ptr %r, i32 4
%m2 = load <4 x i8>, ptr %r1
%jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; The %r halves have no partner vector, so lanes 4-7 come from undef.
%mn1 = shufflevector <4 x i8> %m1, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mn2 = shufflevector <4 x i8> %m2, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Only the first 12 of the 16 available lanes are kept.
%l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%e1 = zext <12 x i8> %l1 to <12 x i32>
%e2 = zext <12 x i8> %l2 to <12 x i32>
%e3 = shl <12 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <12 x i32> %e1, %e3
ret <12 x i32> %a
}
; Four-pointer variant: from each of %p, %q, %r and %s load two <4 x i8>
; vectors (offsets 0 and 4). The offset-0 loads are concatenated into the
; 16-element %l1 and the offset-4 loads into %l2; both are zero-extended to
; <16 x i16> and added.
; The expected assembly uses ldp plus ld1 lane loads per pointer pair and two
; uaddl instructions, with no shift.
; NOTE(review): despite the "_i32" suffix the result type is <16 x i16>, and
; the shifted value %e3 is never used - the final add takes %e2. The expected
; assembly matches the IR as written; confirm whether %e3 was intended.
define <16 x i16> @load_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
; CHECK-LABEL: load_bv_4xv4i8_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v1.s }[1], [x1]
; CHECK-NEXT: ldp s2, s3, [x2]
; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT: ld1 { v3.s }[1], [x3]
; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b
; CHECK-NEXT: ret
%j1 = load <4 x i8>, ptr %p
%p1 = getelementptr i8, ptr %p, i32 4
%j2 = load <4 x i8>, ptr %p1
%k1 = load <4 x i8>, ptr %q
%q1 = getelementptr i8, ptr %q, i32 4
%k2 = load <4 x i8>, ptr %q1
%m1 = load <4 x i8>, ptr %r
%r1 = getelementptr i8, ptr %r, i32 4
%m2 = load <4 x i8>, ptr %r1
%n1 = load <4 x i8>, ptr %s
%s1 = getelementptr i8, ptr %s, i32 4
%n2 = load <4 x i8>, ptr %s1
%jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%e1 = zext <16 x i8> %l1 to <16 x i16>
%e2 = zext <16 x i8> %l2 to <16 x i16>
; %e3 has no users (see the review note at the top of this function).
%e3 = shl <16 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
%a = add <16 x i16> %e1, %e2
ret <16 x i16> %a
}
; Two subtract trees combined with a shift by 16. From each of %p, %q, %r and
; %s load two <4 x i8> vectors (offsets 0 and 4). %jk1/%jk2 concatenate the
; %p,%q loads and %mn1/%mn2 the %r,%s loads (offset 0 and offset 4
; respectively). Each pair is zero-extended to <8 x i16> and subtracted
; (%ajk, %anm). %ajk is sign-extended and %anm zero-extended to <8 x i32>;
; %anm is shifted left by 16 and added to %ajk.
; The expected assembly uses ldp plus ld1 lane loads per pointer pair, usubl
; for the subtractions, shll/shll2 #16 for the extend-and-shift, and
; saddw2/saddw for the final add.
define <8 x i32> @double_bv_2xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
; CHECK-LABEL: double_bv_2xv4i8_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v1.s }[1], [x1]
; CHECK-NEXT: ldp s2, s3, [x2]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT: ld1 { v3.s }[1], [x3]
; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b
; CHECK-NEXT: shll v3.4s, v2.4h, #16
; CHECK-NEXT: shll2 v1.4s, v2.8h, #16
; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h
; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h
; CHECK-NEXT: ret
%j1 = load <4 x i8>, ptr %p
%p1 = getelementptr i8, ptr %p, i32 4
%j2 = load <4 x i8>, ptr %p1
%k1 = load <4 x i8>, ptr %q
%q1 = getelementptr i8, ptr %q, i32 4
%k2 = load <4 x i8>, ptr %q1
%m1 = load <4 x i8>, ptr %r
%r1 = getelementptr i8, ptr %r, i32 4
%m2 = load <4 x i8>, ptr %r1
%n1 = load <4 x i8>, ptr %s
%s1 = getelementptr i8, ptr %s, i32 4
%n2 = load <4 x i8>, ptr %s1
%jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%ejk1 = zext <8 x i8> %jk1 to <8 x i16>
%ejk2 = zext <8 x i8> %jk2 to <8 x i16>
%ajk = sub <8 x i16> %ejk1, %ejk2
%enm1 = zext <8 x i8> %mn1 to <8 x i16>
%enm2 = zext <8 x i8> %mn2 to <8 x i16>
%anm = sub <8 x i16> %enm1, %enm2
; Note the mixed extensions: %ajk is sign-extended, %anm zero-extended.
%x = sext <8 x i16> %ajk to <8 x i32>
%y = zext <8 x i16> %anm to <8 x i32>
%ys = shl <8 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%a = add <8 x i32> %x, %ys
ret <8 x i32> %a
}
; 16-lane version of double_bv_2xv4i8_i32 with eight source pointers. From each
; pointer load two <4 x i8> vectors (offsets 0 and 4). %l1/%l2 concatenate the
; offset-0 / offset-4 loads of %p,%q,%r,%s; %l3/%l4 do the same for
; %t,%u,%v,%w. After zero-extension to <16 x i16>, %ajk = %l1 - %l2 and
; %anm = %l3 - %l4. %ajk is sign-extended and %anm zero-extended to
; <16 x i32>; %anm is shifted left by 16 and added to %ajk.
; The expected assembly uses ldp plus ld1 lane loads for each pointer pair,
; four usubl, shll/shll2 #16 and saddw/saddw2, producing the result in v0-v3.
define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) {
; CHECK-LABEL: double_bv_4xv4i8_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v1.s }[1], [x1]
; CHECK-NEXT: ldp s2, s3, [x2]
; CHECK-NEXT: usubl v1.8h, v0.8b, v1.8b
; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT: ld1 { v3.s }[1], [x3]
; CHECK-NEXT: ldp s4, s5, [x4]
; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b
; CHECK-NEXT: ld1 { v4.s }[1], [x5], #4
; CHECK-NEXT: ld1 { v5.s }[1], [x5]
; CHECK-NEXT: ldp s6, s7, [x6]
; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b
; CHECK-NEXT: ld1 { v6.s }[1], [x7], #4
; CHECK-NEXT: ld1 { v7.s }[1], [x7]
; CHECK-NEXT: shll v0.4s, v4.4h, #16
; CHECK-NEXT: shll2 v4.4s, v4.8h, #16
; CHECK-NEXT: usubl v5.8h, v6.8b, v7.8b
; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT: saddw2 v1.4s, v4.4s, v1.8h
; CHECK-NEXT: shll v6.4s, v5.4h, #16
; CHECK-NEXT: shll2 v3.4s, v5.8h, #16
; CHECK-NEXT: saddw2 v3.4s, v3.4s, v2.8h
; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
; CHECK-NEXT: ret
%j1 = load <4 x i8>, ptr %p
%p1 = getelementptr i8, ptr %p, i32 4
%j2 = load <4 x i8>, ptr %p1
%k1 = load <4 x i8>, ptr %q
%q1 = getelementptr i8, ptr %q, i32 4
%k2 = load <4 x i8>, ptr %q1
%m1 = load <4 x i8>, ptr %r
%r1 = getelementptr i8, ptr %r, i32 4
%m2 = load <4 x i8>, ptr %r1
%n1 = load <4 x i8>, ptr %s
%s1 = getelementptr i8, ptr %s, i32 4
%n2 = load <4 x i8>, ptr %s1
%j3 = load <4 x i8>, ptr %t
%t3 = getelementptr i8, ptr %t, i32 4
%j4 = load <4 x i8>, ptr %t3
%k3 = load <4 x i8>, ptr %u
%u3 = getelementptr i8, ptr %u, i32 4
%k4 = load <4 x i8>, ptr %u3
%m3 = load <4 x i8>, ptr %v
%v3 = getelementptr i8, ptr %v, i32 4
%m4 = load <4 x i8>, ptr %v3
%n3 = load <4 x i8>, ptr %w
%w3 = getelementptr i8, ptr %w, i32 4
%n4 = load <4 x i8>, ptr %w3
; First level: pairwise concatenation of <4 x i8> loads into <8 x i8>.
%jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%jk3 = shufflevector <4 x i8> %j3, <4 x i8> %k3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%jk4 = shufflevector <4 x i8> %j4, <4 x i8> %k4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mn3 = shufflevector <4 x i8> %m3, <4 x i8> %n3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mn4 = shufflevector <4 x i8> %m4, <4 x i8> %n4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Second level: concatenate the <8 x i8> halves into full <16 x i8> vectors.
%l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%l3 = shufflevector <8 x i8> %jk3, <8 x i8> %mn3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%l4 = shufflevector <8 x i8> %jk4, <8 x i8> %mn4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%ejk1 = zext <16 x i8> %l1 to <16 x i16>
%ejk2 = zext <16 x i8> %l2 to <16 x i16>
%ajk = sub <16 x i16> %ejk1, %ejk2
%enm1 = zext <16 x i8> %l3 to <16 x i16>
%enm2 = zext <16 x i8> %l4 to <16 x i16>
%anm = sub <16 x i16> %enm1, %enm2
; Note the mixed extensions: %ajk is sign-extended, %anm zero-extended.
%x = sext <16 x i16> %ajk to <16 x i32>
%y = zext <16 x i16> %anm to <16 x i32>
%ys = shl <16 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%a = add <16 x i32> %x, %ys
ret <16 x i32> %a
}
; Variant of double_bv_4xv4i8_i32 that differs in two ways:
;  * each <16 x i8> operand is assembled incrementally: every <4 x i8> piece is
;    first widened to 16 lanes with poison padding and then merged in, instead
;    of using a balanced tree of concatenations;
;  * the subtractions pair vectors differently: %ajk = %l1 - %l3 and
;    %anm = %l2 - %l4, where %l1/%l2 hold the offset-0 / offset-4 loads of
;    %p,%q,%r,%s and %l3/%l4 those of %t,%u,%v,%w.
; As before, %ajk is sign-extended and %anm zero-extended to <16 x i32>, %anm
; is shifted left by 16, and the two are added.
; The expected assembly has one 8-byte ldr d per pointer argument, four usubl,
; four shll2 #16 and four saddw, producing the result in v0-v3.
define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) {
; CHECK-LABEL: double2_bv_4xv4i8_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x2]
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: ldr d3, [x3]
; CHECK-NEXT: ldr d4, [x4]
; CHECK-NEXT: ldr d5, [x5]
; CHECK-NEXT: ldr d6, [x6]
; CHECK-NEXT: ldr d7, [x7]
; CHECK-NEXT: usubl v1.8h, v1.8b, v4.8b
; CHECK-NEXT: usubl v2.8h, v2.8b, v5.8b
; CHECK-NEXT: usubl v3.8h, v3.8b, v7.8b
; CHECK-NEXT: usubl v4.8h, v0.8b, v6.8b
; CHECK-NEXT: shll2 v0.4s, v1.8h, #16
; CHECK-NEXT: shll2 v5.4s, v2.8h, #16
; CHECK-NEXT: shll2 v6.4s, v4.8h, #16
; CHECK-NEXT: shll2 v7.4s, v3.8h, #16
; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT: saddw v1.4s, v5.4s, v2.4h
; CHECK-NEXT: saddw v2.4s, v6.4s, v4.4h
; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h
; CHECK-NEXT: ret
%j1 = load <4 x i8>, ptr %p
%p1 = getelementptr i8, ptr %p, i32 4
%j2 = load <4 x i8>, ptr %p1
%k1 = load <4 x i8>, ptr %q
%q1 = getelementptr i8, ptr %q, i32 4
%k2 = load <4 x i8>, ptr %q1
%m1 = load <4 x i8>, ptr %r
%r1 = getelementptr i8, ptr %r, i32 4
%m2 = load <4 x i8>, ptr %r1
%n1 = load <4 x i8>, ptr %s
%s1 = getelementptr i8, ptr %s, i32 4
%n2 = load <4 x i8>, ptr %s1
%j3 = load <4 x i8>, ptr %t
%t3 = getelementptr i8, ptr %t, i32 4
%j4 = load <4 x i8>, ptr %t3
%k3 = load <4 x i8>, ptr %u
%u3 = getelementptr i8, ptr %u, i32 4
%k4 = load <4 x i8>, ptr %u3
%m3 = load <4 x i8>, ptr %v
%v3 = getelementptr i8, ptr %v, i32 4
%m4 = load <4 x i8>, ptr %v3
%n3 = load <4 x i8>, ptr %w
%w3 = getelementptr i8, ptr %w, i32 4
%n4 = load <4 x i8>, ptr %w3
; %l1: offset-0 loads of %p, %q, %r, %s in lanes 0-3, 4-7, 8-11, 12-15.
%jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m1l = shufflevector <4 x i8> %m1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n1l = shufflevector <4 x i8> %n1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; %l2: offset-4 loads of %p, %q, %r, %s, built the same way.
%jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m2l = shufflevector <4 x i8> %m2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n2l = shufflevector <4 x i8> %n2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; %l3: offset-0 loads of %t, %u, %v, %w.
%jk3 = shufflevector <4 x i8> %j3, <4 x i8> %k3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m3l = shufflevector <4 x i8> %m3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n3l = shufflevector <4 x i8> %n3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; %l4: offset-4 loads of %t, %u, %v, %w.
%jk4 = shufflevector <4 x i8> %j4, <4 x i8> %k4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m4l = shufflevector <4 x i8> %m4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n4l = shufflevector <4 x i8> %n4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Unlike double_bv_4xv4i8_i32, the subtractions pair %l1 with %l3 and %l2
; with %l4.
%ejk1 = zext <16 x i8> %l1 to <16 x i16>
%ejk2 = zext <16 x i8> %l3 to <16 x i16>
%ajk = sub <16 x i16> %ejk1, %ejk2
%enm1 = zext <16 x i8> %l2 to <16 x i16>
%enm2 = zext <16 x i8> %l4 to <16 x i16>
%anm = sub <16 x i16> %enm1, %enm2
%x = sext <16 x i16> %ajk to <16 x i32>
%y = zext <16 x i16> %anm to <16 x i32>
%ys = shl <16 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%a = add <16 x i32> %x, %ys
ret <16 x i32> %a
}
; Extra-use variant: the first load (%lp1) is stored to %z in addition to
; feeding the shuffle/extend/add chain.
; Four <4 x i8> chunks are loaded from each of %p, %q, %r and %s (byte offsets
; 0, 4, 8, 12), concatenated into four <16 x i8> vectors %l1..%l4, widened to
; i16, added pairwise (%l1+%l3 and %l2+%l4), widened to i32 and combined as
; %e1 + (%e2 << 3).
; NOTE(review): the expected-assembly lines below look tool-generated
; (presumably update_llc_test_checks.py); regenerate rather than hand-edit.
define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_load:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: add x8, x3, #8
; CHECK-NEXT: add x11, x1, #12
; CHECK-NEXT: str s1, [x4]
; CHECK-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-NEXT: ldr s0, [x2]
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-NEXT: umov w9, v2.h[0]
; CHECK-NEXT: umov w10, v2.h[1]
; CHECK-NEXT: mov v0.b[8], w9
; CHECK-NEXT: umov w9, v2.h[2]
; CHECK-NEXT: mov v0.b[9], w10
; CHECK-NEXT: umov w10, v2.h[3]
; CHECK-NEXT: ldr s2, [x1]
; CHECK-NEXT: zip1 v2.8b, v2.8b, v2.8b
; CHECK-NEXT: mov v0.b[10], w9
; CHECK-NEXT: add x9, x1, #4
; CHECK-NEXT: mov v1.d[1], v2.d[0]
; CHECK-NEXT: mov v0.b[11], w10
; CHECK-NEXT: add x10, x3, #12
; CHECK-NEXT: bic v1.8h, #255, lsl #8
; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4
; CHECK-NEXT: ldr s4, [x0, #12]
; CHECK-NEXT: ldp s5, s2, [x2, #4]
; CHECK-NEXT: ldr s6, [x2, #12]
; CHECK-NEXT: ldp s3, s7, [x0, #4]
; CHECK-NEXT: ld1 { v4.s }[1], [x11]
; CHECK-NEXT: ld1 { v6.s }[1], [x10]
; CHECK-NEXT: ld1 { v2.s }[1], [x8]
; CHECK-NEXT: ld1 { v5.s }[1], [x3]
; CHECK-NEXT: add x8, x1, #8
; CHECK-NEXT: ld1 { v3.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: uaddl v3.8h, v3.8b, v4.8b
; CHECK-NEXT: uaddl v4.8h, v5.8b, v6.8b
; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b
; CHECK-NEXT: uaddw2 v2.8h, v2.8h, v0.16b
; CHECK-NEXT: ushll v0.4s, v3.4h, #3
; CHECK-NEXT: ushll v5.4s, v4.4h, #3
; CHECK-NEXT: ushll2 v4.4s, v4.8h, #3
; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
; CHECK-NEXT: uaddw2 v3.4s, v4.4s, v2.8h
; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h
; CHECK-NEXT: ret
; Load the 4-byte chunks from each pointer. The store of %lp1 to %z is the
; extra use this function exercises.
%lp1 = load <4 x i8>, ptr %p
store <4 x i8> %lp1, ptr %z
%p2 = getelementptr i8, ptr %p, i32 4
%lp2 = load <4 x i8>, ptr %p2
%p3 = getelementptr i8, ptr %p, i32 8
%lp3 = load <4 x i8>, ptr %p3
%p4 = getelementptr i8, ptr %p, i32 12
%lp4 = load <4 x i8>, ptr %p4
%lq1 = load <4 x i8>, ptr %q
%q2 = getelementptr i8, ptr %q, i32 4
%lq2 = load <4 x i8>, ptr %q2
%q3 = getelementptr i8, ptr %q, i32 8
%lq3 = load <4 x i8>, ptr %q3
%q4 = getelementptr i8, ptr %q, i32 12
%lq4 = load <4 x i8>, ptr %q4
%lr1 = load <4 x i8>, ptr %r
%r2 = getelementptr i8, ptr %r, i32 4
%lr2 = load <4 x i8>, ptr %r2
%r3 = getelementptr i8, ptr %r, i32 8
%lr3 = load <4 x i8>, ptr %r3
%r4 = getelementptr i8, ptr %r, i32 12
%lr4 = load <4 x i8>, ptr %r4
%ls1 = load <4 x i8>, ptr %s
%s2 = getelementptr i8, ptr %s, i32 4
%ls2 = load <4 x i8>, ptr %s2
%s3 = getelementptr i8, ptr %s, i32 8
%ls3 = load <4 x i8>, ptr %s3
%s4 = getelementptr i8, ptr %s, i32 12
%ls4 = load <4 x i8>, ptr %s4
; Build each 16-byte vector %lN by concatenating the Nth chunk of %p, %q, %r
; and %s (in that order) through a chain of shufflevectors.
%jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Widen to i16, add pairwise, widen to i32, then return %e1 + (%e2 << 3).
%le11 = zext <16 x i8> %l1 to <16 x i16>
%le12 = zext <16 x i8> %l3 to <16 x i16>
%le21 = zext <16 x i8> %l2 to <16 x i16>
%le22 = zext <16 x i8> %l4 to <16 x i16>
%la1 = add <16 x i16> %le11, %le12
%la2 = add <16 x i16> %le21, %le22
%e1 = zext <16 x i16> %la1 to <16 x i32>
%e2 = zext <16 x i16> %la2 to <16 x i32>
%se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <16 x i32> %e1, %se2
ret <16 x i32> %a
}
; Extra-use variant: the fully concatenated shuffle result %l4 is stored to %z
; in addition to feeding the extend/add chain.
; Four <4 x i8> chunks are loaded from each of %p, %q, %r and %s (byte offsets
; 0, 4, 8, 12), concatenated into four <16 x i8> vectors %l1..%l4, widened to
; i16, added pairwise (%l1+%l3 and %l2+%l4), widened to i32 and combined as
; %e1 + (%e2 << 3).
; NOTE(review): the expected-assembly lines below look tool-generated
; (presumably update_llc_test_checks.py); regenerate rather than hand-edit.
define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_shuffle:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s17, s0, [x0, #8]
; CHECK-NEXT: add x8, x3, #8
; CHECK-NEXT: ldr s3, [x1, #12]
; CHECK-NEXT: ldp s2, s16, [x2]
; CHECK-NEXT: ldr s5, [x2, #12]
; CHECK-NEXT: add x9, x1, #8
; CHECK-NEXT: ldr s1, [x3, #12]
; CHECK-NEXT: mov v4.16b, v0.16b
; CHECK-NEXT: mov v0.s[1], v3.s[0]
; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
; CHECK-NEXT: ldp s6, s7, [x0]
; CHECK-NEXT: mov v4.s[1], v3.s[0]
; CHECK-NEXT: ld1 { v6.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v7.s }[1], [x1]
; CHECK-NEXT: ld1 { v16.s }[1], [x3]
; CHECK-NEXT: ldr s3, [x2, #8]
; CHECK-NEXT: ld1 { v17.s }[1], [x9]
; CHECK-NEXT: mov v4.s[2], v5.s[0]
; CHECK-NEXT: mov v5.s[1], v1.s[0]
; CHECK-NEXT: ld1 { v3.s }[1], [x8]
; CHECK-NEXT: uaddl v0.8h, v7.8b, v0.8b
; CHECK-NEXT: uaddl v6.8h, v6.8b, v17.8b
; CHECK-NEXT: uaddl v5.8h, v16.8b, v5.8b
; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
; CHECK-NEXT: ushll v3.4s, v0.4h, #3
; CHECK-NEXT: ushll2 v16.4s, v0.8h, #3
; CHECK-NEXT: mov v4.s[3], v1.s[0]
; CHECK-NEXT: ushll v7.4s, v5.4h, #3
; CHECK-NEXT: ushll2 v5.4s, v5.8h, #3
; CHECK-NEXT: uaddw v0.4s, v3.4s, v6.4h
; CHECK-NEXT: uaddw2 v1.4s, v16.4s, v6.8h
; CHECK-NEXT: str q4, [x4]
; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v2.8h
; CHECK-NEXT: uaddw v2.4s, v7.4s, v2.4h
; CHECK-NEXT: ret
; Load the 4-byte chunks from each pointer.
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
%lp2 = load <4 x i8>, ptr %p2
%p3 = getelementptr i8, ptr %p, i32 8
%lp3 = load <4 x i8>, ptr %p3
%p4 = getelementptr i8, ptr %p, i32 12
%lp4 = load <4 x i8>, ptr %p4
%lq1 = load <4 x i8>, ptr %q
%q2 = getelementptr i8, ptr %q, i32 4
%lq2 = load <4 x i8>, ptr %q2
%q3 = getelementptr i8, ptr %q, i32 8
%lq3 = load <4 x i8>, ptr %q3
%q4 = getelementptr i8, ptr %q, i32 12
%lq4 = load <4 x i8>, ptr %q4
%lr1 = load <4 x i8>, ptr %r
%r2 = getelementptr i8, ptr %r, i32 4
%lr2 = load <4 x i8>, ptr %r2
%r3 = getelementptr i8, ptr %r, i32 8
%lr3 = load <4 x i8>, ptr %r3
%r4 = getelementptr i8, ptr %r, i32 12
%lr4 = load <4 x i8>, ptr %r4
%ls1 = load <4 x i8>, ptr %s
%s2 = getelementptr i8, ptr %s, i32 4
%ls2 = load <4 x i8>, ptr %s2
%s3 = getelementptr i8, ptr %s, i32 8
%ls3 = load <4 x i8>, ptr %s3
%s4 = getelementptr i8, ptr %s, i32 12
%ls4 = load <4 x i8>, ptr %s4
; Build each 16-byte vector %lN by concatenating the Nth chunk of %p, %q, %r
; and %s (in that order) through a chain of shufflevectors.
%jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Extra use under test: the concatenated vector %l4 is also written to %z.
store <16 x i8> %l4, ptr %z
; Widen to i16, add pairwise, widen to i32, then return %e1 + (%e2 << 3).
%le11 = zext <16 x i8> %l1 to <16 x i16>
%le12 = zext <16 x i8> %l3 to <16 x i16>
%le21 = zext <16 x i8> %l2 to <16 x i16>
%le22 = zext <16 x i8> %l4 to <16 x i16>
%la1 = add <16 x i16> %le11, %le12
%la2 = add <16 x i16> %le21, %le22
%e1 = zext <16 x i16> %la1 to <16 x i32>
%e2 = zext <16 x i16> %la2 to <16 x i32>
%se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <16 x i32> %e1, %se2
ret <16 x i32> %a
}
; Extra-use variant: the i8 -> i16 zero-extension %le22 (of %l4) is stored to
; %z in addition to feeding the add chain.
; Four <4 x i8> chunks are loaded from each of %p, %q, %r and %s (byte offsets
; 0, 4, 8, 12), concatenated into four <16 x i8> vectors %l1..%l4, widened to
; i16, added pairwise (%l1+%l3 and %l2+%l4), widened to i32 and combined as
; %e1 + (%e2 << 3).
; NOTE(review): the expected-assembly lines below look tool-generated
; (presumably update_llc_test_checks.py); regenerate rather than hand-edit.
define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_ext:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s5, [x2]
; CHECK-NEXT: add x8, x3, #8
; CHECK-NEXT: add x9, x3, #12
; CHECK-NEXT: add x10, x1, #8
; CHECK-NEXT: add x11, x1, #12
; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4
; CHECK-NEXT: ldp s1, s2, [x0]
; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v2.s }[1], [x1]
; CHECK-NEXT: ldp s7, s4, [x0, #8]
; CHECK-NEXT: ld1 { v5.s }[1], [x3]
; CHECK-NEXT: ldp s6, s3, [x2, #8]
; CHECK-NEXT: ld1 { v4.s }[1], [x11]
; CHECK-NEXT: ld1 { v7.s }[1], [x10]
; CHECK-NEXT: ld1 { v3.s }[1], [x9]
; CHECK-NEXT: ld1 { v6.s }[1], [x8]
; CHECK-NEXT: uaddl v2.8h, v2.8b, v4.8b
; CHECK-NEXT: uaddl v1.8h, v1.8b, v7.8b
; CHECK-NEXT: ushll v4.8h, v4.8b, #0
; CHECK-NEXT: uaddl v5.8h, v5.8b, v3.8b
; CHECK-NEXT: uaddl v6.8h, v0.8b, v6.8b
; CHECK-NEXT: ushll v16.8h, v3.8b, #0
; CHECK-NEXT: ushll v0.4s, v2.4h, #3
; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
; CHECK-NEXT: ushll v7.4s, v5.4h, #3
; CHECK-NEXT: ushll2 v5.4s, v5.8h, #3
; CHECK-NEXT: stp q4, q16, [x4]
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v6.8h
; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h
; CHECK-NEXT: ret
; Load the 4-byte chunks from each pointer.
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
%lp2 = load <4 x i8>, ptr %p2
%p3 = getelementptr i8, ptr %p, i32 8
%lp3 = load <4 x i8>, ptr %p3
%p4 = getelementptr i8, ptr %p, i32 12
%lp4 = load <4 x i8>, ptr %p4
%lq1 = load <4 x i8>, ptr %q
%q2 = getelementptr i8, ptr %q, i32 4
%lq2 = load <4 x i8>, ptr %q2
%q3 = getelementptr i8, ptr %q, i32 8
%lq3 = load <4 x i8>, ptr %q3
%q4 = getelementptr i8, ptr %q, i32 12
%lq4 = load <4 x i8>, ptr %q4
%lr1 = load <4 x i8>, ptr %r
%r2 = getelementptr i8, ptr %r, i32 4
%lr2 = load <4 x i8>, ptr %r2
%r3 = getelementptr i8, ptr %r, i32 8
%lr3 = load <4 x i8>, ptr %r3
%r4 = getelementptr i8, ptr %r, i32 12
%lr4 = load <4 x i8>, ptr %r4
%ls1 = load <4 x i8>, ptr %s
%s2 = getelementptr i8, ptr %s, i32 4
%ls2 = load <4 x i8>, ptr %s2
%s3 = getelementptr i8, ptr %s, i32 8
%ls3 = load <4 x i8>, ptr %s3
%s4 = getelementptr i8, ptr %s, i32 12
%ls4 = load <4 x i8>, ptr %s4
; Build each 16-byte vector %lN by concatenating the Nth chunk of %p, %q, %r
; and %s (in that order) through a chain of shufflevectors.
%jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Widen to i16, add pairwise, widen to i32, then return %e1 + (%e2 << 3).
%le11 = zext <16 x i8> %l1 to <16 x i16>
%le12 = zext <16 x i8> %l3 to <16 x i16>
%le21 = zext <16 x i8> %l2 to <16 x i16>
%le22 = zext <16 x i8> %l4 to <16 x i16>
; Extra use under test: the widened value %le22 is also written to %z.
store <16 x i16> %le22, ptr %z
%la1 = add <16 x i16> %le11, %le12
%la2 = add <16 x i16> %le21, %le22
%e1 = zext <16 x i16> %la1 to <16 x i32>
%e2 = zext <16 x i16> %la2 to <16 x i32>
%se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <16 x i32> %e1, %se2
ret <16 x i32> %a
}
; Extra-use variant: the i16 sum %la2 (%l2 + %l4, both zero-extended) is stored
; to %z in addition to feeding the i32 extend/shift/add chain.
; Four <4 x i8> chunks are loaded from each of %p, %q, %r and %s (byte offsets
; 0, 4, 8, 12), concatenated into four <16 x i8> vectors %l1..%l4, widened to
; i16, added pairwise (%l1+%l3 and %l2+%l4), widened to i32 and combined as
; %e1 + (%e2 << 3).
; NOTE(review): the expected-assembly lines below look tool-generated
; (presumably update_llc_test_checks.py); regenerate rather than hand-edit.
define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_add:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s5, [x2]
; CHECK-NEXT: add x8, x3, #8
; CHECK-NEXT: add x9, x3, #12
; CHECK-NEXT: add x10, x1, #8
; CHECK-NEXT: add x11, x1, #12
; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4
; CHECK-NEXT: ldp s1, s2, [x0]
; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v2.s }[1], [x1]
; CHECK-NEXT: ldp s7, s3, [x0, #8]
; CHECK-NEXT: ld1 { v5.s }[1], [x3]
; CHECK-NEXT: ldp s6, s4, [x2, #8]
; CHECK-NEXT: ld1 { v3.s }[1], [x11]
; CHECK-NEXT: ld1 { v7.s }[1], [x10]
; CHECK-NEXT: ld1 { v4.s }[1], [x9]
; CHECK-NEXT: ld1 { v6.s }[1], [x8]
; CHECK-NEXT: uaddl v16.8h, v2.8b, v3.8b
; CHECK-NEXT: uaddl v1.8h, v1.8b, v7.8b
; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b
; CHECK-NEXT: uaddl v2.8h, v0.8b, v6.8b
; CHECK-NEXT: ushll v0.4s, v16.4h, #3
; CHECK-NEXT: ushll2 v6.4s, v16.8h, #3
; CHECK-NEXT: ushll v5.4s, v4.4h, #3
; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3
; CHECK-NEXT: stp q16, q4, [x4]
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT: uaddw2 v1.4s, v6.4s, v1.8h
; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h
; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h
; CHECK-NEXT: ret
; Load the 4-byte chunks from each pointer.
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
%lp2 = load <4 x i8>, ptr %p2
%p3 = getelementptr i8, ptr %p, i32 8
%lp3 = load <4 x i8>, ptr %p3
%p4 = getelementptr i8, ptr %p, i32 12
%lp4 = load <4 x i8>, ptr %p4
%lq1 = load <4 x i8>, ptr %q
%q2 = getelementptr i8, ptr %q, i32 4
%lq2 = load <4 x i8>, ptr %q2
%q3 = getelementptr i8, ptr %q, i32 8
%lq3 = load <4 x i8>, ptr %q3
%q4 = getelementptr i8, ptr %q, i32 12
%lq4 = load <4 x i8>, ptr %q4
%lr1 = load <4 x i8>, ptr %r
%r2 = getelementptr i8, ptr %r, i32 4
%lr2 = load <4 x i8>, ptr %r2
%r3 = getelementptr i8, ptr %r, i32 8
%lr3 = load <4 x i8>, ptr %r3
%r4 = getelementptr i8, ptr %r, i32 12
%lr4 = load <4 x i8>, ptr %r4
%ls1 = load <4 x i8>, ptr %s
%s2 = getelementptr i8, ptr %s, i32 4
%ls2 = load <4 x i8>, ptr %s2
%s3 = getelementptr i8, ptr %s, i32 8
%ls3 = load <4 x i8>, ptr %s3
%s4 = getelementptr i8, ptr %s, i32 12
%ls4 = load <4 x i8>, ptr %s4
; Build each 16-byte vector %lN by concatenating the Nth chunk of %p, %q, %r
; and %s (in that order) through a chain of shufflevectors.
%jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Widen to i16, add pairwise, widen to i32, then return %e1 + (%e2 << 3).
%le11 = zext <16 x i8> %l1 to <16 x i16>
%le12 = zext <16 x i8> %l3 to <16 x i16>
%le21 = zext <16 x i8> %l2 to <16 x i16>
%le22 = zext <16 x i8> %l4 to <16 x i16>
%la1 = add <16 x i16> %le11, %le12
%la2 = add <16 x i16> %le21, %le22
; Extra use under test: the i16 sum %la2 is also written to %z.
store <16 x i16> %la2, ptr %z
%e1 = zext <16 x i16> %la1 to <16 x i32>
%e2 = zext <16 x i16> %la2 to <16 x i32>
%se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <16 x i32> %e1, %se2
ret <16 x i32> %a
}
; Extra-use variant: the i16 -> i32 zero-extension %e2 (of the sum %la2) is
; stored to %z in addition to feeding the shift/add.
; Four <4 x i8> chunks are loaded from each of %p, %q, %r and %s (byte offsets
; 0, 4, 8, 12), concatenated into four <16 x i8> vectors %l1..%l4, widened to
; i16, added pairwise (%l1+%l3 and %l2+%l4), widened to i32 and combined as
; %e1 + (%e2 << 3).
; NOTE(review): the expected-assembly lines below look tool-generated
; (presumably update_llc_test_checks.py); regenerate rather than hand-edit.
define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_ext2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s4, [x2]
; CHECK-NEXT: add x8, x3, #8
; CHECK-NEXT: add x9, x3, #12
; CHECK-NEXT: add x10, x1, #8
; CHECK-NEXT: add x11, x1, #12
; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4
; CHECK-NEXT: ldp s1, s2, [x0]
; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v2.s }[1], [x1]
; CHECK-NEXT: ldp s6, s3, [x0, #8]
; CHECK-NEXT: ld1 { v4.s }[1], [x3]
; CHECK-NEXT: ldp s7, s5, [x2, #8]
; CHECK-NEXT: ld1 { v3.s }[1], [x11]
; CHECK-NEXT: ld1 { v6.s }[1], [x10]
; CHECK-NEXT: ld1 { v5.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
; CHECK-NEXT: uaddl v1.8h, v1.8b, v6.8b
; CHECK-NEXT: uaddl v3.8h, v4.8b, v5.8b
; CHECK-NEXT: uaddl v4.8h, v0.8b, v7.8b
; CHECK-NEXT: ushll2 v0.4s, v2.8h, #0
; CHECK-NEXT: ushll v5.4s, v2.4h, #3
; CHECK-NEXT: ushll2 v16.4s, v2.8h, #3
; CHECK-NEXT: ushll v6.4s, v3.4h, #3
; CHECK-NEXT: ushll2 v7.4s, v3.8h, #3
; CHECK-NEXT: ushll v17.4s, v2.4h, #0
; CHECK-NEXT: ushll2 v18.4s, v3.8h, #0
; CHECK-NEXT: ushll v19.4s, v3.4h, #0
; CHECK-NEXT: stp q17, q0, [x4]
; CHECK-NEXT: uaddw v0.4s, v5.4s, v1.4h
; CHECK-NEXT: uaddw2 v1.4s, v16.4s, v1.8h
; CHECK-NEXT: uaddw2 v3.4s, v7.4s, v4.8h
; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h
; CHECK-NEXT: stp q19, q18, [x4, #32]
; CHECK-NEXT: ret
; Load the 4-byte chunks from each pointer.
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
%lp2 = load <4 x i8>, ptr %p2
%p3 = getelementptr i8, ptr %p, i32 8
%lp3 = load <4 x i8>, ptr %p3
%p4 = getelementptr i8, ptr %p, i32 12
%lp4 = load <4 x i8>, ptr %p4
%lq1 = load <4 x i8>, ptr %q
%q2 = getelementptr i8, ptr %q, i32 4
%lq2 = load <4 x i8>, ptr %q2
%q3 = getelementptr i8, ptr %q, i32 8
%lq3 = load <4 x i8>, ptr %q3
%q4 = getelementptr i8, ptr %q, i32 12
%lq4 = load <4 x i8>, ptr %q4
%lr1 = load <4 x i8>, ptr %r
%r2 = getelementptr i8, ptr %r, i32 4
%lr2 = load <4 x i8>, ptr %r2
%r3 = getelementptr i8, ptr %r, i32 8
%lr3 = load <4 x i8>, ptr %r3
%r4 = getelementptr i8, ptr %r, i32 12
%lr4 = load <4 x i8>, ptr %r4
%ls1 = load <4 x i8>, ptr %s
%s2 = getelementptr i8, ptr %s, i32 4
%ls2 = load <4 x i8>, ptr %s2
%s3 = getelementptr i8, ptr %s, i32 8
%ls3 = load <4 x i8>, ptr %s3
%s4 = getelementptr i8, ptr %s, i32 12
%ls4 = load <4 x i8>, ptr %s4
; Build each 16-byte vector %lN by concatenating the Nth chunk of %p, %q, %r
; and %s (in that order) through a chain of shufflevectors.
%jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Widen to i16, add pairwise, widen to i32, then return %e1 + (%e2 << 3).
%le11 = zext <16 x i8> %l1 to <16 x i16>
%le12 = zext <16 x i8> %l3 to <16 x i16>
%le21 = zext <16 x i8> %l2 to <16 x i16>
%le22 = zext <16 x i8> %l4 to <16 x i16>
%la1 = add <16 x i16> %le11, %le12
%la2 = add <16 x i16> %le21, %le22
%e1 = zext <16 x i16> %la1 to <16 x i32>
%e2 = zext <16 x i16> %la2 to <16 x i32>
; Extra use under test: the i32-widened value %e2 is also written to %z.
store <16 x i32> %e2, ptr %z
%se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <16 x i32> %e1, %se2
ret <16 x i32> %a
}
; extrause_shl: extend/add/shift pattern where the shifted value has an extra
; use (it is stored to %z in addition to feeding the final add).
;
; The function reads four consecutive <4 x i8> chunks (byte offsets 0, 4, 8
; and 12) from each of %p, %q, %r and %s, concatenates chunk N of the four
; pointers into one <16 x i8> value %lN, and then computes
;   %a = zext(zext(%l1) + zext(%l3)) + (zext(zext(%l2) + zext(%l4)) << 3)
; where the inner extends/adds are done in i16 and the outer ones in i32.
; The autogenerated assertions below expect uaddl for the i16 adds, ushll/ushll2
; with #3 for the shift, uaddw/uaddw2 for the final add, and stores of the
; shifted registers to [x4].
define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-LABEL: extrause_shl:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s4, [x2]
; CHECK-NEXT: add x8, x3, #8
; CHECK-NEXT: add x9, x3, #12
; CHECK-NEXT: add x10, x1, #8
; CHECK-NEXT: add x11, x1, #12
; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4
; CHECK-NEXT: ldp s1, s2, [x0]
; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v2.s }[1], [x1]
; CHECK-NEXT: ldp s6, s3, [x0, #8]
; CHECK-NEXT: ld1 { v4.s }[1], [x3]
; CHECK-NEXT: ldp s7, s5, [x2, #8]
; CHECK-NEXT: ld1 { v3.s }[1], [x11]
; CHECK-NEXT: ld1 { v6.s }[1], [x10]
; CHECK-NEXT: ld1 { v5.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
; CHECK-NEXT: uaddl v1.8h, v1.8b, v6.8b
; CHECK-NEXT: uaddl v3.8h, v4.8b, v5.8b
; CHECK-NEXT: uaddl v5.8h, v0.8b, v7.8b
; CHECK-NEXT: ushll v4.4s, v2.4h, #3
; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
; CHECK-NEXT: ushll v6.4s, v3.4h, #3
; CHECK-NEXT: ushll2 v7.4s, v3.8h, #3
; CHECK-NEXT: uaddw v0.4s, v4.4s, v1.4h
; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
; CHECK-NEXT: str q4, [x4]
; CHECK-NEXT: stp q2, q6, [x4, #16]
; CHECK-NEXT: uaddw2 v3.4s, v7.4s, v5.8h
; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h
; CHECK-NEXT: str q7, [x4, #48]
; CHECK-NEXT: ret
; Four 4-byte chunks from %p at byte offsets 0, 4, 8, 12.
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
%lp2 = load <4 x i8>, ptr %p2
%p3 = getelementptr i8, ptr %p, i32 8
%lp3 = load <4 x i8>, ptr %p3
%p4 = getelementptr i8, ptr %p, i32 12
%lp4 = load <4 x i8>, ptr %p4
; Same four chunks from %q.
%lq1 = load <4 x i8>, ptr %q
%q2 = getelementptr i8, ptr %q, i32 4
%lq2 = load <4 x i8>, ptr %q2
%q3 = getelementptr i8, ptr %q, i32 8
%lq3 = load <4 x i8>, ptr %q3
%q4 = getelementptr i8, ptr %q, i32 12
%lq4 = load <4 x i8>, ptr %q4
; Same four chunks from %r.
%lr1 = load <4 x i8>, ptr %r
%r2 = getelementptr i8, ptr %r, i32 4
%lr2 = load <4 x i8>, ptr %r2
%r3 = getelementptr i8, ptr %r, i32 8
%lr3 = load <4 x i8>, ptr %r3
%r4 = getelementptr i8, ptr %r, i32 12
%lr4 = load <4 x i8>, ptr %r4
; Same four chunks from %s.
%ls1 = load <4 x i8>, ptr %s
%s2 = getelementptr i8, ptr %s, i32 4
%ls2 = load <4 x i8>, ptr %s2
%s3 = getelementptr i8, ptr %s, i32 8
%ls3 = load <4 x i8>, ptr %s3
%s4 = getelementptr i8, ptr %s, i32 12
%ls4 = load <4 x i8>, ptr %s4
; Each group of five shuffles below builds %lN = %lpN | %lqN | %lrN | %lsN:
;   %jkN  places %lpN in lanes 0-3 and %lqN in lanes 4-7,
;   %mNl  widens %lrN to 16 lanes, %jkmN inserts it into lanes 8-11,
;   %nNl  widens %lsN to 16 lanes, %lN inserts it into lanes 12-15.
%jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
%jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
%n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
%l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Widen to i16 and add the pairs (%l1, %l3) and (%l2, %l4).
%le11 = zext <16 x i8> %l1 to <16 x i16>
%le12 = zext <16 x i8> %l3 to <16 x i16>
%le21 = zext <16 x i8> %l2 to <16 x i16>
%le22 = zext <16 x i8> %l4 to <16 x i16>
%la1 = add <16 x i16> %le11, %le12
%la2 = add <16 x i16> %le21, %le22
; Widen both sums to i32; only the second one is shifted.
%e1 = zext <16 x i16> %la1 to <16 x i32>
%e2 = zext <16 x i16> %la2 to <16 x i32>
%se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; Extra use: the shifted value is stored as well as added.
store <16 x i32> %se2, ptr %z
%a = add <16 x i32> %e1, %se2
ret <16 x i32> %a
}
; commuted_loads: two adjacent <8 x i8> loads from %p1 (offsets 0 and 8) are
; added to the matching loads from %p2, with the %p2 value as the FIRST add
; operand in both adds. The low-half sum is zero-extended to i32, the
; high-half sum is zero-extended and shifted left by 3, and the two are added.
; The autogenerated assertions expect one 128-bit ldr per pointer and a single
; 16-byte add before the ushll/uaddw sequence.
define <8 x i32> @commuted_loads(ptr %p1, ptr %p2) {
; CHECK-LABEL: commuted_loads:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: add v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3
; CHECK-NEXT: ushll v3.4s, v1.4h, #3
; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v0.8h
; CHECK-NEXT: uaddw v0.4s, v3.4s, v0.4h
; CHECK-NEXT: ret
; Low and high 8-byte halves from each pointer.
%l11 = load <8 x i8>, ptr %p1
%q1 = getelementptr i8, ptr %p1, i32 8
%l12 = load <8 x i8>, ptr %q1
%l21 = load <8 x i8>, ptr %p2
%q2 = getelementptr i8, ptr %p2, i32 8
%l22 = load <8 x i8>, ptr %q2
; Operand order is (%p2 value, %p1 value) in both adds.
%l1 = add <8 x i8> %l21, %l11
%l2 = add <8 x i8> %l22, %l12
%e1 = zext <8 x i8> %l1 to <8 x i32>
%e2 = zext <8 x i8> %l2 to <8 x i32>
; Only the high-half sum is shifted.
%se2 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <8 x i32> %e1, %se2
ret <8 x i32> %a
}
; commuted_loads2: same loads as commuted_loads, with the adds in
; (%p1 value, %p2 value) order, but the roles of the two sums are swapped:
; %e1 extends the HIGH-half sum %l2 and the shifted %e2 extends the LOW-half
; sum %l1. The autogenerated assertions expect the loads to stay as 64-bit
; ldp pairs with two separate 8-byte adds.
define <8 x i32> @commuted_loads2(ptr %p1, ptr %p2) {
; CHECK-LABEL: commuted_loads2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp d0, d3, [x1]
; CHECK-NEXT: ldp d1, d2, [x0]
; CHECK-NEXT: add v0.8b, v1.8b, v0.8b
; CHECK-NEXT: add v1.8b, v2.8b, v3.8b
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll v2.8h, v1.8b, #0
; CHECK-NEXT: ushll v3.4s, v0.4h, #3
; CHECK-NEXT: ushll2 v0.4s, v0.8h, #3
; CHECK-NEXT: uaddw2 v1.4s, v0.4s, v2.8h
; CHECK-NEXT: uaddw v0.4s, v3.4s, v2.4h
; CHECK-NEXT: ret
; Low and high 8-byte halves from each pointer.
%l11 = load <8 x i8>, ptr %p1
%q1 = getelementptr i8, ptr %p1, i32 8
%l12 = load <8 x i8>, ptr %q1
%l21 = load <8 x i8>, ptr %p2
%q2 = getelementptr i8, ptr %p2, i32 8
%l22 = load <8 x i8>, ptr %q2
%l1 = add <8 x i8> %l11, %l21
%l2 = add <8 x i8> %l12, %l22
; Note the swap: %e1 comes from %l2 and %e2 (shifted below) from %l1.
%e1 = zext <8 x i8> %l2 to <8 x i32>
%e2 = zext <8 x i8> %l1 to <8 x i32>
%se2 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%a = add <8 x i32> %e1, %se2
ret <8 x i32> %a
}
; commuted_sub: same loads and adds as the commuted_loads tests, but the final
; operation is a subtraction with the shifted value on the left:
;   %a = (zext(%l2) << 3) - zext(%l1)
; The autogenerated assertions expect 64-bit loads, two 8-byte adds and a
; usubw/usubw2 pair for the subtraction.
; NOTE(review): in the expected output the %p2 loads form an ldp but the %p1
; loads are two separate ldr instructions with an add scheduled between them -
; presumably a scheduling/load-pairing artifact; confirm this is acceptable.
define <8 x i32> @commuted_sub(ptr %p1, ptr %p2) {
; CHECK-LABEL: commuted_sub:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp d2, d1, [x1]
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: add v1.8b, v1.8b, v2.8b
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll v2.8h, v1.8b, #0
; CHECK-NEXT: ushll v3.4s, v0.4h, #3
; CHECK-NEXT: ushll2 v0.4s, v0.8h, #3
; CHECK-NEXT: usubw2 v1.4s, v0.4s, v2.8h
; CHECK-NEXT: usubw v0.4s, v3.4s, v2.4h
; CHECK-NEXT: ret
; Low and high 8-byte halves from each pointer.
%l11 = load <8 x i8>, ptr %p1
%q1 = getelementptr i8, ptr %p1, i32 8
%l12 = load <8 x i8>, ptr %q1
%l21 = load <8 x i8>, ptr %p2
%q2 = getelementptr i8, ptr %p2, i32 8
%l22 = load <8 x i8>, ptr %q2
%l1 = add <8 x i8> %l11, %l21
%l2 = add <8 x i8> %l12, %l22
%e1 = zext <8 x i8> %l1 to <8 x i32>
%e2 = zext <8 x i8> %l2 to <8 x i32>
%se2 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; Shifted value is the minuend, the unshifted extend the subtrahend.
%a = sub <8 x i32> %se2, %e1
ret <8 x i32> %a
}
; bitcast: the two <4 x i8> inputs come from plain float loads at %p and
; %p + 4 that are bitcast to vectors. The first is zero-extended to i32, the
; second is zero-extended and shifted left by 3, and the two are added.
; The autogenerated assertions expect both loads to be covered by a single
; 8-byte ldr, followed by ushll, ushll2 #3 and uaddw.
define <4 x i32> @bitcast(ptr %p) {
; CHECK-LABEL: bitcast:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3
; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT: ret
; Scalar float loads reinterpreted as four bytes each.
%l1b = load float, ptr %p
%l1 = bitcast float %l1b to <4 x i8>
%q = getelementptr i8, ptr %p, i32 4
%l2b = load float, ptr %q
%l2 = bitcast float %l2b to <4 x i8>
%e1 = zext <4 x i8> %l1 to <4 x i32>
%e2 = zext <4 x i8> %l2 to <4 x i32>
%e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
%a = add <4 x i32> %e1, %e3
ret <4 x i32> %a
}
; atomic: same computation as the bitcast test, except that the first float
; load is an atomic acquire load. The autogenerated assertions expect the
; atomic load to be emitted on its own as ldar, the second load as a separate
; ldr s from [x0, #4], and a plain vector add at the end (no uaddw).
define <4 x i32> @atomic(ptr %p) {
; CHECK-LABEL: atomic:
; CHECK: // %bb.0:
; CHECK-NEXT: ldar w8, [x0]
; CHECK-NEXT: movi v0.2d, #0x0000ff000000ff
; CHECK-NEXT: ldr s1, [x0, #4]
; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: zip1 v2.8b, v2.8b, v0.8b
; CHECK-NEXT: ushll v1.4s, v1.4h, #3
; CHECK-NEXT: ushll v2.4s, v2.4h, #0
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
; The acquire-ordered atomic load is what distinguishes this test.
%l1b = load atomic float, ptr %p acquire, align 4
%l1 = bitcast float %l1b to <4 x i8>
%q = getelementptr i8, ptr %p, i32 4
%l2b = load float, ptr %q
%l2 = bitcast float %l2b to <4 x i8>
%e1 = zext <4 x i8> %l1 to <4 x i32>
%e2 = zext <4 x i8> %l2 to <4 x i32>
%e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
%a = add <4 x i32> %e1, %e3
ret <4 x i32> %a
}
; volatile: same computation as the bitcast test, except that the first float
; load is volatile. The autogenerated assertions expect two separate 4-byte
; ldr s loads (at [x0] and [x0, #4]) rather than one combined load, followed
; by ushll, ushll #3 and uaddw.
; NOTE(review): the expected output also adjusts sp by 16 bytes although no
; stack access appears in it - presumably a leftover stack slot from lowering;
; confirm whether this is intended.
define <4 x i32> @volatile(ptr %p) {
; CHECK-LABEL: volatile:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr s1, [x0, #4]
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll v1.4s, v1.4h, #3
; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
; The volatile load is what distinguishes this test.
%l1b = load volatile float, ptr %p
%l1 = bitcast float %l1b to <4 x i8>
%q = getelementptr i8, ptr %p, i32 4
%l2b = load float, ptr %q
%l2 = bitcast float %l2b to <4 x i8>
%e1 = zext <4 x i8> %l1 to <4 x i32>
%e2 = zext <4 x i8> %l2 to <4 x i32>
%e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
%a = add <4 x i32> %e1, %e3
ret <4 x i32> %a
}