Nemanja Ivanovic 4ea121c904 [PowerPC] Fix a number of inefficiencies and issues with atomic code gen
There are a few issues with the code we generate for atomic operations and the way we generate it:

- Hard coded CR0 for compares
- Order of operands for compares not conducive to
  emitting compare-immediate or for CSE of compares
- Missing MachineMemOperand for the st[bhwd]cx intrinsics
- Missing intrinsic properties for those same intrinsics
- Unnecessary blocks with store conditional
  instructions to clear reservation (which ends
  up hindering performance)
- Move-from-CR instructions emitted just to compare
  the result of a store conditional with zero (even
  though the store conditional is a record-form
  instruction)

This patch aims to resolve all of those issues.

Differential revision: https://reviews.llvm.org/D134783
2022-10-03 19:55:29 -05:00

84 lines
2.9 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \
; RUN: < %s | FileCheck --check-prefix=CHECK-64 %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-unknown \
; RUN: < %s | FileCheck --check-prefix=CHECK-32 %s
; test_add: a sequentially-consistent atomic floating-point add on a float.
;
; The IR body is a single `atomicrmw fadd` whose result is returned. The
; assertions below pin the code llc emits for it under both llc invocations
; at the top of this file (64-bit and 32-bit PowerPC triples):
;   - a `sync` and an initial `lfs` of the current value in the entry block;
;   - an outer loop that computes the sum with `fadds` and moves both the new
;     and the expected value from FP registers to GPRs via stack slots
;     (`stfs` followed by `lwz`);
;   - an inner `lwarx` / `cmpw` / `stwcx.` loop, with the retry branch placed
;     directly after `stwcx.`;
;   - an `lwsync` before returning the result in f1 (`fmr 1, 0`).
; The 32-bit expectations differ from the 64-bit ones in that they allocate a
; 32-byte frame (`stwu 1, -32(1)` / `addi 1, 1, 32`) and use positive offsets
; from r1 for the stack slots, where the 64-bit ones use negative offsets
; without adjusting r1.
;
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define float @test_add(float* %ptr, float %incr) {
; CHECK-64-LABEL: test_add:
; CHECK-64: # %bb.0: # %entry
; CHECK-64-NEXT: sync
; CHECK-64-NEXT: lfs 0, 0(3)
; CHECK-64-NEXT: b .LBB0_2
; CHECK-64-NEXT: .LBB0_1: # %atomicrmw.start
; CHECK-64-NEXT: #
; CHECK-64-NEXT: stw 6, -4(1)
; CHECK-64-NEXT: cmplw 6, 4
; CHECK-64-NEXT: lfs 0, -4(1)
; CHECK-64-NEXT: beq 0, .LBB0_5
; CHECK-64-NEXT: .LBB0_2: # %atomicrmw.start
; CHECK-64-NEXT: # =>This Loop Header: Depth=1
; CHECK-64-NEXT: # Child Loop BB0_3 Depth 2
; CHECK-64-NEXT: fadds 2, 0, 1
; CHECK-64-NEXT: stfs 2, -8(1)
; CHECK-64-NEXT: stfs 0, -12(1)
; CHECK-64-NEXT: lwz 5, -8(1)
; CHECK-64-NEXT: lwz 4, -12(1)
; CHECK-64-NEXT: .LBB0_3: # %atomicrmw.start
; CHECK-64-NEXT: # Parent Loop BB0_2 Depth=1
; CHECK-64-NEXT: # => This Inner Loop Header: Depth=2
; CHECK-64-NEXT: lwarx 6, 0, 3
; CHECK-64-NEXT: cmpw 6, 4
; CHECK-64-NEXT: bne 0, .LBB0_1
; CHECK-64-NEXT: # %bb.4: # %atomicrmw.start
; CHECK-64-NEXT: #
; CHECK-64-NEXT: stwcx. 5, 0, 3
; CHECK-64-NEXT: bne 0, .LBB0_3
; CHECK-64-NEXT: b .LBB0_1
; CHECK-64-NEXT: .LBB0_5: # %atomicrmw.end
; CHECK-64-NEXT: fmr 1, 0
; CHECK-64-NEXT: lwsync
; CHECK-64-NEXT: blr
;
; CHECK-32-LABEL: test_add:
; CHECK-32: # %bb.0: # %entry
; CHECK-32-NEXT: stwu 1, -32(1)
; CHECK-32-NEXT: .cfi_def_cfa_offset 32
; CHECK-32-NEXT: sync
; CHECK-32-NEXT: lfs 0, 0(3)
; CHECK-32-NEXT: b .LBB0_2
; CHECK-32-NEXT: .LBB0_1: # %atomicrmw.start
; CHECK-32-NEXT: #
; CHECK-32-NEXT: stw 6, 28(1)
; CHECK-32-NEXT: cmplw 6, 4
; CHECK-32-NEXT: lfs 0, 28(1)
; CHECK-32-NEXT: beq 0, .LBB0_5
; CHECK-32-NEXT: .LBB0_2: # %atomicrmw.start
; CHECK-32-NEXT: # =>This Loop Header: Depth=1
; CHECK-32-NEXT: # Child Loop BB0_3 Depth 2
; CHECK-32-NEXT: fadds 2, 0, 1
; CHECK-32-NEXT: stfs 2, 24(1)
; CHECK-32-NEXT: stfs 0, 20(1)
; CHECK-32-NEXT: lwz 5, 24(1)
; CHECK-32-NEXT: lwz 4, 20(1)
; CHECK-32-NEXT: .LBB0_3: # %atomicrmw.start
; CHECK-32-NEXT: # Parent Loop BB0_2 Depth=1
; CHECK-32-NEXT: # => This Inner Loop Header: Depth=2
; CHECK-32-NEXT: lwarx 6, 0, 3
; CHECK-32-NEXT: cmpw 6, 4
; CHECK-32-NEXT: bne 0, .LBB0_1
; CHECK-32-NEXT: # %bb.4: # %atomicrmw.start
; CHECK-32-NEXT: #
; CHECK-32-NEXT: stwcx. 5, 0, 3
; CHECK-32-NEXT: bne 0, .LBB0_3
; CHECK-32-NEXT: b .LBB0_1
; CHECK-32-NEXT: .LBB0_5: # %atomicrmw.end
; CHECK-32-NEXT: fmr 1, 0
; CHECK-32-NEXT: lwsync
; CHECK-32-NEXT: addi 1, 1, 32
; CHECK-32-NEXT: blr
entry:
; Atomically perform *%ptr = *%ptr + %incr with seq_cst ordering. Per the
; LLVM Language Reference, atomicrmw yields the value that was in memory
; before the operation, so %r is the pre-add contents of %ptr.
%r = atomicrmw fadd float* %ptr, float %incr seq_cst
ret float %r
}