
Relands 7ff3a9acd84654c9ec2939f45ba27f162ae7fbc3 after regenerating the test case.

Supersedes the draft PR #94992, taking a different approach following feedback:
* Lower in PreISelIntrinsicLowering
* Don't require that the number of bytes to set is a compile-time constant
* Define llvm.memset_pattern rather than llvm.memset_pattern.inline

As discussed in the [RFC thread](https://discourse.llvm.org/t/rfc-introducing-an-llvm-memset-pattern-inline-intrinsic/79496), the intent is that the intrinsic will be lowered to loops, a sequence of stores, or libcalls depending on the expected cost and the availability of libcalls on the target. Right now there is just a single lowering path that aims to handle all cases. My intent is to follow up with additional PRs adding further optimisations where possible (e.g. when libcalls are available, when arguments are known to be constant, etc.).
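For context, the CHECK lines in the test below come from expanding calls such as `tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 4, i1 0)`. The following is a minimal illustrative sketch of the store loop the naive expansion builds: the block names mirror the `loadstoreloop`/`split` labels visible in the checks, but the function name and exact IR are hypothetical rather than the literal pass output.

```llvm
; Sketch only: roughly what the current naive PreISelIntrinsicLowering
; expansion produces for a runtime count of %n i128-pattern stores.
; Not the literal output of the pass.
define void @memset_pattern_sketch(ptr %dst, i128 %value, i64 %n) {
entry:
  %empty = icmp eq i64 %n, 0
  br i1 %empty, label %split, label %loadstoreloop

loadstoreloop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loadstoreloop ]
  ; Store one copy of the pattern at %dst + %i * 16.
  %addr = getelementptr i128, ptr %dst, i64 %i
  store i128 %value, ptr %addr, align 8
  %i.next = add i64 %i, 1
  %done = icmp ult i64 %i.next, %n
  br i1 %done, label %loadstoreloop, label %split

split:
  ret void
}
```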
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=riscv32 -mattr=+m \
; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32
; RUN: llc < %s -mtriple=riscv64 -mattr=+m \
; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64
; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+unaligned-scalar-mem \
; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+unaligned-scalar-mem \
; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST

; TODO: Due to the initial naive lowering implementation of memset.pattern in
; PreISelIntrinsicLowering, the generated code is not good.

define void @memset_1(ptr %a, i128 %value) nounwind {
; RV32-BOTH-LABEL: memset_1:
; RV32-BOTH: # %bb.0: # %loadstoreloop.preheader
; RV32-BOTH-NEXT: li a2, 0
; RV32-BOTH-NEXT: lw a3, 0(a1)
; RV32-BOTH-NEXT: lw a4, 4(a1)
; RV32-BOTH-NEXT: lw a5, 8(a1)
; RV32-BOTH-NEXT: lw a1, 12(a1)
; RV32-BOTH-NEXT: li a6, 0
; RV32-BOTH-NEXT: .LBB0_1: # %loadstoreloop
; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-BOTH-NEXT: slli a7, a2, 4
; RV32-BOTH-NEXT: addi a2, a2, 1
; RV32-BOTH-NEXT: add a7, a0, a7
; RV32-BOTH-NEXT: seqz t0, a2
; RV32-BOTH-NEXT: add a6, a6, t0
; RV32-BOTH-NEXT: or t0, a2, a6
; RV32-BOTH-NEXT: sw a3, 0(a7)
; RV32-BOTH-NEXT: sw a4, 4(a7)
; RV32-BOTH-NEXT: sw a5, 8(a7)
; RV32-BOTH-NEXT: sw a1, 12(a7)
; RV32-BOTH-NEXT: beqz t0, .LBB0_1
; RV32-BOTH-NEXT: # %bb.2: # %split
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: memset_1:
; RV64-BOTH: # %bb.0: # %loadstoreloop.preheader
; RV64-BOTH-NEXT: addi a3, a0, 16
; RV64-BOTH-NEXT: .LBB0_1: # %loadstoreloop
; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-BOTH-NEXT: sd a1, 0(a0)
; RV64-BOTH-NEXT: sd a2, 8(a0)
; RV64-BOTH-NEXT: addi a0, a0, 16
; RV64-BOTH-NEXT: bne a0, a3, .LBB0_1
; RV64-BOTH-NEXT: # %bb.2: # %split
; RV64-BOTH-NEXT: ret
  tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 1, i1 0)
  ret void
}

define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV32-LABEL: memset_1_noalign:
; RV32: # %bb.0: # %loadstoreloop.preheader
; RV32-NEXT: addi sp, sp, -32
; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 0
; RV32-NEXT: li a3, 0
; RV32-NEXT: lw a4, 4(a1)
; RV32-NEXT: lw a5, 0(a1)
; RV32-NEXT: lw a6, 8(a1)
; RV32-NEXT: lw a1, 12(a1)
; RV32-NEXT: srli a7, a4, 24
; RV32-NEXT: srli t0, a4, 16
; RV32-NEXT: srli t1, a4, 8
; RV32-NEXT: srli t2, a5, 24
; RV32-NEXT: srli t3, a5, 16
; RV32-NEXT: srli t4, a5, 8
; RV32-NEXT: srli t5, a6, 24
; RV32-NEXT: srli t6, a6, 16
; RV32-NEXT: srli s0, a6, 8
; RV32-NEXT: srli s1, a1, 24
; RV32-NEXT: srli s2, a1, 16
; RV32-NEXT: srli s3, a1, 8
; RV32-NEXT: .LBB1_1: # %loadstoreloop
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-NEXT: slli s4, a2, 4
; RV32-NEXT: addi a2, a2, 1
; RV32-NEXT: add s4, a0, s4
; RV32-NEXT: seqz s5, a2
; RV32-NEXT: sb a4, 4(s4)
; RV32-NEXT: sb t1, 5(s4)
; RV32-NEXT: sb t0, 6(s4)
; RV32-NEXT: sb a7, 7(s4)
; RV32-NEXT: sb a5, 0(s4)
; RV32-NEXT: sb t4, 1(s4)
; RV32-NEXT: sb t3, 2(s4)
; RV32-NEXT: sb t2, 3(s4)
; RV32-NEXT: sb a6, 8(s4)
; RV32-NEXT: sb s0, 9(s4)
; RV32-NEXT: sb t6, 10(s4)
; RV32-NEXT: sb t5, 11(s4)
; RV32-NEXT: add a3, a3, s5
; RV32-NEXT: or s5, a2, a3
; RV32-NEXT: sb a1, 12(s4)
; RV32-NEXT: sb s3, 13(s4)
; RV32-NEXT: sb s2, 14(s4)
; RV32-NEXT: sb s1, 15(s4)
; RV32-NEXT: beqz s5, .LBB1_1
; RV32-NEXT: # %bb.2: # %split
; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: memset_1_noalign:
; RV64: # %bb.0: # %loadstoreloop.preheader
; RV64-NEXT: addi sp, sp, -32
; RV64-NEXT: sd s0, 24(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s1, 16(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s2, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: addi a3, a0, 16
; RV64-NEXT: srli a4, a1, 56
; RV64-NEXT: srli a5, a1, 48
; RV64-NEXT: srli a6, a1, 40
; RV64-NEXT: srli a7, a1, 32
; RV64-NEXT: srli t0, a1, 24
; RV64-NEXT: srli t1, a1, 16
; RV64-NEXT: srli t2, a1, 8
; RV64-NEXT: srli t3, a2, 56
; RV64-NEXT: srli t4, a2, 48
; RV64-NEXT: srli t5, a2, 40
; RV64-NEXT: srli t6, a2, 32
; RV64-NEXT: srli s0, a2, 24
; RV64-NEXT: srli s1, a2, 16
; RV64-NEXT: srli s2, a2, 8
; RV64-NEXT: .LBB1_1: # %loadstoreloop
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-NEXT: sb a7, 4(a0)
; RV64-NEXT: sb a6, 5(a0)
; RV64-NEXT: sb a5, 6(a0)
; RV64-NEXT: sb a4, 7(a0)
; RV64-NEXT: sb a1, 0(a0)
; RV64-NEXT: sb t2, 1(a0)
; RV64-NEXT: sb t1, 2(a0)
; RV64-NEXT: sb t0, 3(a0)
; RV64-NEXT: sb t6, 12(a0)
; RV64-NEXT: sb t5, 13(a0)
; RV64-NEXT: sb t4, 14(a0)
; RV64-NEXT: sb t3, 15(a0)
; RV64-NEXT: sb a2, 8(a0)
; RV64-NEXT: sb s2, 9(a0)
; RV64-NEXT: sb s1, 10(a0)
; RV64-NEXT: sb s0, 11(a0)
; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: bne a0, a3, .LBB1_1
; RV64-NEXT: # %bb.2: # %split
; RV64-NEXT: ld s0, 24(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 16(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s2, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 32
; RV64-NEXT: ret
;
; RV32-FAST-LABEL: memset_1_noalign:
; RV32-FAST: # %bb.0: # %loadstoreloop.preheader
; RV32-FAST-NEXT: li a2, 0
; RV32-FAST-NEXT: lw a3, 0(a1)
; RV32-FAST-NEXT: lw a4, 4(a1)
; RV32-FAST-NEXT: lw a5, 8(a1)
; RV32-FAST-NEXT: lw a1, 12(a1)
; RV32-FAST-NEXT: li a6, 0
; RV32-FAST-NEXT: .LBB1_1: # %loadstoreloop
; RV32-FAST-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-FAST-NEXT: slli a7, a2, 4
; RV32-FAST-NEXT: addi a2, a2, 1
; RV32-FAST-NEXT: add a7, a0, a7
; RV32-FAST-NEXT: seqz t0, a2
; RV32-FAST-NEXT: add a6, a6, t0
; RV32-FAST-NEXT: or t0, a2, a6
; RV32-FAST-NEXT: sw a3, 0(a7)
; RV32-FAST-NEXT: sw a4, 4(a7)
; RV32-FAST-NEXT: sw a5, 8(a7)
; RV32-FAST-NEXT: sw a1, 12(a7)
; RV32-FAST-NEXT: beqz t0, .LBB1_1
; RV32-FAST-NEXT: # %bb.2: # %split
; RV32-FAST-NEXT: ret
;
; RV64-FAST-LABEL: memset_1_noalign:
; RV64-FAST: # %bb.0: # %loadstoreloop.preheader
; RV64-FAST-NEXT: addi a3, a0, 16
; RV64-FAST-NEXT: .LBB1_1: # %loadstoreloop
; RV64-FAST-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-FAST-NEXT: sd a1, 0(a0)
; RV64-FAST-NEXT: sd a2, 8(a0)
; RV64-FAST-NEXT: addi a0, a0, 16
; RV64-FAST-NEXT: bne a0, a3, .LBB1_1
; RV64-FAST-NEXT: # %bb.2: # %split
; RV64-FAST-NEXT: ret
  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0)
  ret void
}

define void @memset_4(ptr %a, i128 %value) nounwind {
; RV32-BOTH-LABEL: memset_4:
; RV32-BOTH: # %bb.0: # %loadstoreloop.preheader
; RV32-BOTH-NEXT: li a2, 0
; RV32-BOTH-NEXT: lw a3, 0(a1)
; RV32-BOTH-NEXT: lw a4, 4(a1)
; RV32-BOTH-NEXT: lw a5, 8(a1)
; RV32-BOTH-NEXT: lw a1, 12(a1)
; RV32-BOTH-NEXT: li a6, 0
; RV32-BOTH-NEXT: .LBB2_1: # %loadstoreloop
; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-BOTH-NEXT: slli a7, a2, 4
; RV32-BOTH-NEXT: addi a2, a2, 1
; RV32-BOTH-NEXT: seqz t0, a2
; RV32-BOTH-NEXT: sltiu t1, a2, 4
; RV32-BOTH-NEXT: add a6, a6, t0
; RV32-BOTH-NEXT: seqz t0, a6
; RV32-BOTH-NEXT: and t0, t0, t1
; RV32-BOTH-NEXT: add a7, a0, a7
; RV32-BOTH-NEXT: sw a3, 0(a7)
; RV32-BOTH-NEXT: sw a4, 4(a7)
; RV32-BOTH-NEXT: sw a5, 8(a7)
; RV32-BOTH-NEXT: sw a1, 12(a7)
; RV32-BOTH-NEXT: bnez t0, .LBB2_1
; RV32-BOTH-NEXT: # %bb.2: # %split
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: memset_4:
; RV64-BOTH: # %bb.0: # %loadstoreloop.preheader
; RV64-BOTH-NEXT: addi a3, a0, 64
; RV64-BOTH-NEXT: .LBB2_1: # %loadstoreloop
; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-BOTH-NEXT: sd a1, 0(a0)
; RV64-BOTH-NEXT: sd a2, 8(a0)
; RV64-BOTH-NEXT: addi a0, a0, 16
; RV64-BOTH-NEXT: bne a0, a3, .LBB2_1
; RV64-BOTH-NEXT: # %bb.2: # %split
; RV64-BOTH-NEXT: ret
  tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 4, i1 0)
  ret void
}

define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
; RV32-BOTH-LABEL: memset_x:
; RV32-BOTH: # %bb.0:
; RV32-BOTH-NEXT: or a4, a2, a3
; RV32-BOTH-NEXT: beqz a4, .LBB3_5
; RV32-BOTH-NEXT: # %bb.1: # %loadstoreloop.preheader
; RV32-BOTH-NEXT: li a4, 0
; RV32-BOTH-NEXT: lw a5, 0(a1)
; RV32-BOTH-NEXT: lw a6, 4(a1)
; RV32-BOTH-NEXT: lw a7, 8(a1)
; RV32-BOTH-NEXT: lw a1, 12(a1)
; RV32-BOTH-NEXT: li t0, 0
; RV32-BOTH-NEXT: j .LBB3_3
; RV32-BOTH-NEXT: .LBB3_2: # %loadstoreloop
; RV32-BOTH-NEXT: # in Loop: Header=BB3_3 Depth=1
; RV32-BOTH-NEXT: sltu t1, t0, a3
; RV32-BOTH-NEXT: beqz t1, .LBB3_5
; RV32-BOTH-NEXT: .LBB3_3: # %loadstoreloop
; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-BOTH-NEXT: slli t1, a4, 4
; RV32-BOTH-NEXT: addi a4, a4, 1
; RV32-BOTH-NEXT: seqz t2, a4
; RV32-BOTH-NEXT: add t0, t0, t2
; RV32-BOTH-NEXT: add t1, a0, t1
; RV32-BOTH-NEXT: sw a5, 0(t1)
; RV32-BOTH-NEXT: sw a6, 4(t1)
; RV32-BOTH-NEXT: sw a7, 8(t1)
; RV32-BOTH-NEXT: sw a1, 12(t1)
; RV32-BOTH-NEXT: bne t0, a3, .LBB3_2
; RV32-BOTH-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1
; RV32-BOTH-NEXT: sltu t1, a4, a2
; RV32-BOTH-NEXT: bnez t1, .LBB3_3
; RV32-BOTH-NEXT: .LBB3_5: # %split
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: memset_x:
; RV64-BOTH: # %bb.0:
; RV64-BOTH-NEXT: beqz a3, .LBB3_3
; RV64-BOTH-NEXT: # %bb.1: # %loadstoreloop.preheader
; RV64-BOTH-NEXT: li a4, 0
; RV64-BOTH-NEXT: .LBB3_2: # %loadstoreloop
; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-BOTH-NEXT: sd a1, 0(a0)
; RV64-BOTH-NEXT: sd a2, 8(a0)
; RV64-BOTH-NEXT: addi a4, a4, 1
; RV64-BOTH-NEXT: addi a0, a0, 16
; RV64-BOTH-NEXT: bltu a4, a3, .LBB3_2
; RV64-BOTH-NEXT: .LBB3_3: # %split
; RV64-BOTH-NEXT: ret
  tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 %x, i1 0)
  ret void
}