Hongtao Yu 5740bb801a [CSSPGO] Use nested context-sensitive profile.
CSSPGO currently employs a flat profile format for context-sensitive profiles. Such a flat profile allows for precisely manipulating contexts that are either inlined or not inlined. This is a benefit over the nested profile format used by non-CS AutoFDO. A downside of this is the longer build time due to parsing and indexing the full CS contexts.

For a CS flat profile, though only the context profiles relevant to a module are loaded when that module is compiled, the cost to figure out what profiles are relevant is noticeably high when there are many contexts, since the sample reader will need to scan all context strings anyway. In contrast, a nested function profile has its related inline subcontexts isolated from other unrelated contexts. Therefore when compiling a set of functions, unrelated contexts will never need to be scanned.

In this change we are exploring using the nested profile format for CSSPGO. This is expected to work based on an assumption that with a preinliner-computed profile all contexts are precomputed and expected to be inlined by the compiler. Contexts not expected to be inlined will be cut off and returned to corresponding base profiles (for top-level outlined functions). This naturally forms a nested profile where all nested contexts are expected to be inlined. The compiler will be less likely to optimize on derived contexts that are not precomputed.

A CS-nested profile will look exactly the same as a regular nested profile except that each nested profile can come with attributes. With pseudo probes, a nested profile such as the one shown below can also have a CFG checksum.

```

main:1968679:12
 2: 24
 3: 28 _Z5funcAi:18
 3.1: 28 _Z5funcBi:30
 3: _Z5funcAi:1467398
  0: 10
  1: 10 _Z8funcLeafi:11
  3: 24
  1: _Z8funcLeafi:1467299
   0: 6
   1: 6
   3: 287884
   4: 287864 _Z3fibi:315608
   15: 23
   !CFGChecksum: 138828622701
   !Attributes: 2
  !CFGChecksum: 281479271677951
  !Attributes: 2
```

Specific work included in this change:
- A recursive profile converter to convert CS flat profile to nested profile.
- Extend function checksum and attribute metadata to be stored in nested way for text profile and extbinary profile.
- Unify the sample loader inliner path for CS and preinlined nested profiles.
- Changes in the sample loader to support probe-based nested profiles.

I've seen promising results regarding build time. A nested profile can result in a 20% shorter build time than a CS flat profile while keeping on-par performance. This is with -duplicate-contexts-into-base=1.

Test Plan:

Reviewed By: wenlei

Differential Revision: https://reviews.llvm.org/D115205
2021-12-14 14:40:25 -08:00

105 lines
5.0 KiB
LLVM

; Regression test for the sample-profile loader's treatment of the profile of
; a callee that is not inlined, with and without -sample-profile-merge-inlinee.
; Every command below feeds this file to opt together with a profile from the
; Inputs directory and pipes the textual IR to FileCheck, using one of two
; check prefixes (SCALE or MERGE). All commands pass -use-profiled-call-graph=0.
;
; Test we lose details of not inlined profile without '-sample-profile-merge-inlinee'
; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=false -use-profiled-call-graph=0 -enable-new-pm=0 -S | FileCheck -check-prefix=SCALE %s
; NOTE(review): the next command passes -sample-profile-merge-inlinee=true yet
; still checks the SCALE prefix. Presumably merging does not take effect when
; the pass runs with -enable-new-pm=0 -- confirm against the loader sources
; before changing either the flag or the prefix.
; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=true -use-profiled-call-graph=0 -enable-new-pm=0 -S | FileCheck -check-prefix=SCALE %s
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=false -use-profiled-call-graph=0 -S | FileCheck -check-prefix=SCALE %s
; Test we properly merge not inlined profile with '-sample-profile-merge-inlinee'
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=true -use-profiled-call-graph=0 -S | FileCheck -check-prefix=MERGE %s
; Same as the previous command, plus -sample-profile-prioritized-inline=1; the
; MERGE expectations are shared.
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee=true -use-profiled-call-graph=0 -sample-profile-prioritized-inline=1 -S | FileCheck -check-prefix=MERGE %s
; Test we properly merge not inlined profile with '-sample-profile-merge-inlinee'
; when the profile uses md5.
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.md5.prof -sample-profile-merge-inlinee=true -use-profiled-call-graph=0 -S | FileCheck -check-prefix=MERGE %s
; Private 11-byte string constant ("sum is %d\n" plus NUL terminator). None of
; the functions defined below reference it.
@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
; @main: reserves three i32 stack slots, loads two of them (no store to either
; slot precedes the loads in this function), passes the loaded values to
; @_Z3sumii, writes the result back into one slot and returns 0.
; Both check prefixes expect the call to @_Z3sumii to appear in the output.
define i32 @main() #0 !dbg !6 {
entry:
  %ret.slot = alloca i32, align 4
  %acc.slot = alloca i32, align 4
  %idx.slot = alloca i32, align 4
  %idx.val = load i32, i32* %idx.slot, align 4, !dbg !8
  %acc.val = load i32, i32* %acc.slot, align 4, !dbg !8
  %sum.res = call i32 @_Z3sumii(i32 %idx.val, i32 %acc.val), !dbg !8
; SCALE: call i32 @_Z3sumii
; MERGE: call i32 @_Z3sumii
  store i32 %sum.res, i32* %acc.slot, align 4, !dbg !8
  ret i32 0, !dbg !11
}
; @_Z3sumii(x, y): spills both arguments to stack slots, computes x + y, then
; branches on whether y differs from 100. On the "differs" path it also calls
; @_Z3subii(x, y) and discards that result. Both paths return x + y.
define i32 @_Z3sumii(i32 %x, i32 %y) #0 !dbg !12 {
entry:
  %x.slot = alloca i32, align 4
  %y.slot = alloca i32, align 4
  store i32 %x, i32* %x.slot, align 4
  store i32 %y, i32* %y.slot, align 4
  %lhs = load i32, i32* %x.slot, align 4, !dbg !13
  %rhs = load i32, i32* %y.slot, align 4, !dbg !13
  %total = add nsw i32 %lhs, %rhs, !dbg !13
  ; Second pair of loads: these feed the comparison and the call below.
  %x.again = load i32, i32* %x.slot, align 4, !dbg !13
  %y.again = load i32, i32* %y.slot, align 4, !dbg !13
  %y.not100 = icmp ne i32 %y.again, 100, !dbg !13
  br i1 %y.not100, label %if.then, label %if.else, !dbg !13

if.then: ; preds = %entry
  ; The callee's result is unused; only the call itself matters here.
  %sub.res = call i32 @_Z3subii(i32 %x.again, i32 %y.again), !dbg !14
  ret i32 %total, !dbg !14

if.else: ; preds = %entry
  ret i32 %total, !dbg !15
}
; @_Z3subii(x, y): spills both arguments to stack slots, reloads them and
; returns x - y (computed with the nsw flag).
define i32 @_Z3subii(i32 %x, i32 %y) #0 !dbg !16 {
entry:
  %x.slot = alloca i32, align 4
  %y.slot = alloca i32, align 4
  store i32 %x, i32* %x.slot, align 4
  store i32 %y, i32* %y.slot, align 4
  %minuend = load i32, i32* %x.slot, align 4, !dbg !17
  %subtrahend = load i32, i32* %y.slot, align 4, !dbg !17
  %diff = sub nsw i32 %minuend, %subtrahend, !dbg !17
  ret i32 %diff, !dbg !18
}
; Attribute group shared by @main, @_Z3sumii and @_Z3subii (each is tagged #0).
attributes #0 = { "use-sample-profile" }
; Variadic printf declaration; none of the functions defined above call it.
declare i32 @printf(i8*, ...)
; Module-level named metadata: the single compile unit, the module flags and
; the producer identification string.
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!llvm.ident = !{!5}
; !0-!5: compile unit for "calls.cc" (C++, emissionKind NoDebug), the empty
; tuple !2 reused wherever a list is required, and the two module flags.
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5 ", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "calls.cc", directory: ".")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 4}
!4 = !{i32 1, !"Debug Info Version", i32 3}
!5 = !{!"clang version 3.5 "}
; !6-!11: subprogram "main" (declared at line 7, attached to @main) and the
; locations its instructions use. !8 is line 10 reached through a lexical
; block file carrying discriminator 2; !11 is line 12.
!6 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !7, scopeLine: 7, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
!7 = !DISubroutineType(types: !2)
!8 = !DILocation(line: 10, scope: !9)
!9 = !DILexicalBlockFile(scope: !10, file: !1, discriminator: 2)
!10 = distinct !DILexicalBlock(scope: !6, file: !1, line: 10)
!11 = !DILocation(line: 12, scope: !6)
; !12-!15: subprogram "sum" (declared at line 3, attached to @_Z3sumii) with
; locations at lines 4, 5 and 6.
!12 = distinct !DISubprogram(name: "sum", scope: !1, file: !1, line: 3, type: !7, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
!13 = !DILocation(line: 4, scope: !12)
!14 = !DILocation(line: 5, scope: !12)
!15 = !DILocation(line: 6, scope: !12)
; !16-!18: subprogram "sub" (declared at line 20, attached to @_Z3subii) with
; locations at lines 20 and 21.
!16 = distinct !DISubprogram(name: "sub", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
!17 = !DILocation(line: 20, scope: !16)
!18 = !DILocation(line: 21, scope: !16)
; Expectations for the SCALE prefix: the metadata line after the "sum"
; subprogram must carry an entry count of 46, followed later by a two-operand
; branch_weights node (11, 2) and a single-operand one (20); the "sub"
; subprogram must be followed by an entry count of -1.
; NOTE(review): -1 presumably marks a function with no usable profile -- confirm.
; SCALE: name: "sum"
; SCALE-NEXT: {!"function_entry_count", i64 46}
; SCALE: !{!"branch_weights", i32 11, i32 2}
; SCALE: !{!"branch_weights", i32 20}
; SCALE: name: "sub"
; SCALE-NEXT: {!"function_entry_count", i64 -1}
; Expectations for the MERGE prefix: same shape as above, but the second
; branch weight is 23 instead of 2, the single-operand weight is 10 instead
; of 20, and the "sub" entry count is 3 instead of -1.
; MERGE: name: "sum"
; MERGE-NEXT: {!"function_entry_count", i64 46}
; MERGE: !{!"branch_weights", i32 11, i32 23}
; MERGE: !{!"branch_weights", i32 10}
; MERGE: name: "sub"
; MERGE-NEXT: {!"function_entry_count", i64 3}