
This reapplies 067caaa and 382a085 (reverting b35f6e2) with fixes to issues detected by the address sanitizer (MIs have to be removed from live intervals before being removed from their parent MBB). Original commit description below. AMDGPU scheduler's `PreRARematStage` attempts to increase function occupancy w.r.t. ArchVGPR usage by rematerializing trivial ArchVGPR-defining instruction next to their single use. It first collects all eligible trivially rematerializable instructions in the function, then sinks them one-by-one while recomputing occupancy in all affected regions each time to determine if and when it has managed to increase overall occupancy. If it does, changes are committed to the scheduler's state; otherwise modifications to the IR are reverted and the scheduling stage gives up. In both cases, this scheduling stage currently involves repeated queries for up-to-date occupancy estimates and some state copying to enable reversal of sinking decisions when occupancy is revealed not to increase. The current implementation also does not accurately track register pressure changes in all regions affected by sinking decisions. This commit refactors this scheduling stage, improving RP tracking and splitting the stage into two distinct steps to avoid repeated occupancy queries and IR/state rollbacks. - Analysis and collection (`canIncreaseOccupancyOrReduceSpill`). The number of ArchVGPRs to save to reduce spilling or increase function occupancy by 1 (when there is no spilling) is computed. Then, instructions eligible for rematerialization are collected, stopping as soon as enough have been identified to be able to achieve our goal (according to slightly optimistic heuristics). If there aren't enough of such instructions, the scheduling stage stops here. - Rematerialization (`rematerialize`). Instructions collected in the first step are rematerialized one-by-one. Now we are able to directly update the scheduler's state since we have already done the occupancy analysis and know we won't have to rollback any state. Register pressures for impacted regions are recomputed only once, as opposed to at every sinking decision. In the case where the stage attempted to increase occupancy, and if both rematerializations alone and rescheduling after were unable to improve occupancy, then all rematerializations are rollbacked.
157 lines
8.2 KiB
YAML
157 lines
8.2 KiB
YAML
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
|
|
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s
|
|
# REQUIRES: asserts
|
|
|
|
--- |
|
|
define void @sink_and_inc_idx_when_skipping_small_region_1() "amdgpu-flat-work-group-size"="1,64" {
|
|
ret void
|
|
}
|
|
|
|
define void @sink_and_inc_idx_when_skipping_small_regions_2() "amdgpu-flat-work-group-size"="1,64" {
|
|
ret void
|
|
}
|
|
---
|
|
name: sink_and_inc_idx_when_skipping_small_region_1
|
|
tracksRegLiveness: true
|
|
machineFunctionInfo:
|
|
isEntryFunction: true
|
|
body: |
|
|
; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_region_1: IsSSA, NoPHIs, TracksLiveness
|
|
; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
|
|
; DEBUG-NEXT: ********** MI Scheduling **********
|
|
; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_region_1:%bb.2
|
|
; DEBUG-NEXT: From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
|
|
; DEBUG-NEXT: To: End RegionInstrs: 2
|
|
bb.0:
|
|
successors: %bb.1
|
|
|
|
%0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
|
|
%1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
|
|
%2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
|
|
%3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
|
|
%4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
|
|
%5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
|
|
%6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
|
|
%7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
|
|
%8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
|
|
%9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
|
|
%10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
|
|
%11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
|
|
%12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
|
|
%13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
|
|
%14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
|
|
%15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
|
|
%16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
|
|
%17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
|
|
%18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
|
|
%19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
|
|
%20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
|
|
%21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
|
|
%22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
|
|
|
|
bb.1:
|
|
; predecessors: %bb.0
|
|
successors: %bb.2
|
|
|
|
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
|
|
S_NOP 0
|
|
|
|
|
|
bb.2:
|
|
; predecessors: %bb.1
|
|
successors: %bb.3
|
|
|
|
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
|
|
S_NOP 0, implicit %24
|
|
|
|
bb.3:
|
|
; predcessors: %bb.2
|
|
|
|
S_NOP 0, implicit %23
|
|
S_NOP 0, implicit %0, implicit %1
|
|
S_NOP 0, implicit %2, implicit %3
|
|
S_NOP 0, implicit %4, implicit %5
|
|
S_NOP 0, implicit %6, implicit %7
|
|
S_NOP 0, implicit %8, implicit %9
|
|
S_NOP 0, implicit %10, implicit %11
|
|
S_NOP 0, implicit %12, implicit %13
|
|
S_NOP 0, implicit %14, implicit %15
|
|
S_NOP 0, implicit %16, implicit %17
|
|
S_NOP 0, implicit %18, implicit %19
|
|
S_NOP 0, implicit %20, implicit %21
|
|
S_NOP 0, implicit %22
|
|
S_ENDPGM 0
|
|
...
|
|
---
|
|
name: sink_and_inc_idx_when_skipping_small_regions_2
|
|
tracksRegLiveness: true
|
|
machineFunctionInfo:
|
|
isEntryFunction: true
|
|
body: |
|
|
; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_regions_2: IsSSA, NoPHIs, TracksLiveness
|
|
; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
|
|
; DEBUG-NEXT: ********** MI Scheduling **********
|
|
; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_regions_2:%bb.2
|
|
; DEBUG-NEXT: From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
|
|
; DEBUG-NEXT: To: End RegionInstrs: 4
|
|
bb.0:
|
|
successors: %bb.1
|
|
|
|
%0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
|
|
%1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
|
|
%2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
|
|
%3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
|
|
%4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
|
|
%5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
|
|
%6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
|
|
%7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
|
|
%8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
|
|
%9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
|
|
%10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
|
|
%11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
|
|
%12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
|
|
%13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
|
|
%14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
|
|
%15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
|
|
%16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
|
|
%17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
|
|
%18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
|
|
%19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
|
|
%20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
|
|
%21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
|
|
%22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
|
|
|
|
bb.1:
|
|
; predecessors: %bb.0
|
|
successors: %bb.2
|
|
|
|
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
|
|
S_NOP 0
|
|
|
|
|
|
bb.2:
|
|
; predecessors: %bb.1
|
|
successors: %bb.3
|
|
|
|
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
|
|
S_NOP 0, implicit %24
|
|
S_NOP 0, implicit %23
|
|
|
|
bb.3:
|
|
; predcessors: %bb.2
|
|
|
|
S_NOP 0, implicit %0, implicit %1
|
|
S_NOP 0, implicit %2, implicit %3
|
|
S_NOP 0, implicit %4, implicit %5
|
|
S_NOP 0, implicit %6, implicit %7
|
|
S_NOP 0, implicit %8, implicit %9
|
|
S_NOP 0, implicit %10, implicit %11
|
|
S_NOP 0, implicit %12, implicit %13
|
|
S_NOP 0, implicit %14, implicit %15
|
|
S_NOP 0, implicit %16, implicit %17
|
|
S_NOP 0, implicit %18, implicit %19
|
|
S_NOP 0, implicit %20, implicit %21
|
|
S_NOP 0, implicit %22
|
|
S_ENDPGM 0
|
|
...
|