llvm-project/lld/test/MachO/cfstring-dedup.s
Jez Ng 8ec1033933 [lld-macho] Deduplicate CFStrings during ICF
`__cfstring` has embedded addends that foil ICF's hashing / equality
checks. (We can ignore embedded addends when doing ICF because the same
information gets recorded in our Reloc structs.) Therefore, in order to
properly dedup CFStrings, we create a mutable copy of the CFString and
zero out the embedded addends before performing any hashing / equality
checks.

(We did in fact have a partial implementation of CFString deduplication
already. However, it only worked when the cstrings they point to are at
identical offsets in their object files.)

I anticipate this approach can be extended to other similar
statically-allocated struct sections in the future.

In addition, we previously treated all references with differing addends
as unequal. This is not true when the references are to literals:
different addends may point to the same literal in the output binary. In
particular, `__cfstring` has such references to `__cstring`. I've
adjusted ICF's `equalsConstant` logic accordingly, and I've added a few
more tests to make sure the addend-comparison code path is adequately
covered.

Fixes https://github.com/llvm/llvm-project/issues/51281.

Reviewed By: #lld-macho, Roger

Differential Revision: https://reviews.llvm.org/D120137
2022-03-08 08:34:03 -05:00

151 lines
4.2 KiB
ArmAsm

## Verifies that linking with --icf=all folds identical UTF-8 CFStrings (and
## the functions that reference them) across two object files.
# REQUIRES: x86
## Pipeline: split this file into foo1.s / foo2.s, assemble each for x86_64
## Darwin, link both into a dylib with ICF enabled against CoreFoundation, then
## dump the rebase table, bind table, symbol table and disassembly for
## FileCheck.
# RUN: rm -rf %t; split-file %s %t
# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/foo1.s -o %t/foo1.o
# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/foo2.s -o %t/foo2.o
# RUN: %lld -dylib --icf=all -framework CoreFoundation %t/foo1.o %t/foo2.o -o %t/foo
# RUN: llvm-objdump --macho --rebase --bind --syms -d %t/foo | FileCheck %s
## Expected disassembly: _foo1/_foo2 share one body, and so do
## _named_foo1/_named_foo2; both bodies load from _named_cfstring. The two
## *_utf16 functions each keep their own body, and [[#]] accepts any numeric
## operand there.
# CHECK: (__TEXT,__text) section
# CHECK-NEXT: _foo1:
# CHECK-NEXT: _foo2:
# CHECK-NEXT: movq _named_cfstring(%rip), %rax
# CHECK-NEXT: _foo1_utf16:
# CHECK-NEXT: movq [[#]](%rip), %rax
# CHECK-NEXT: _named_foo1:
# CHECK-NEXT: _named_foo2:
# CHECK-NEXT: movq _named_cfstring(%rip), %rax
# CHECK-NEXT: _foo2_utf16:
# CHECK-NEXT: movq [[#]](%rip), %rax
## _foo1 and _foo2 must be listed at the same address: the first line captures
## it as a 16-digit hex value in FOO and the second requires the same value.
# CHECK: SYMBOL TABLE:
# CHECK-DAG: [[#%.16x,FOO:]] g F __TEXT,__text _foo1
# CHECK-DAG: [[#FOO]] g F __TEXT,__text _foo2
## Make sure we don't emit redundant bind / rebase opcodes for folded sections.
## Exactly three __cfstring entries are expected in each table below, although
## the inputs define six CFStrings: the four identical UTF-8 ones collapse to
## one, while the two UTF-16 ones are not deduplicated (see the FIXME in
## foo1.s).
# CHECK: Rebase table:
# CHECK-NEXT: segment section address type
# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer
# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer
# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer
# CHECK-EMPTY:
# CHECK-NEXT: Bind table:
# CHECK-NEXT: segment section address type addend dylib symbol
# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer 0 CoreFoundation ___CFConstantStringClassReference
# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer 0 CoreFoundation ___CFConstantStringClassReference
# CHECK-NEXT: __DATA_CONST __cfstring {{.*}} pointer 0 CoreFoundation ___CFConstantStringClassReference
# CHECK-EMPTY:
#--- foo1.s
## First input object. Defines two UTF-8 CFStrings and one UTF-16 CFString,
## plus one exported function per CFString that loads from it.
.cstring
L_.str.0:
.asciz "bar"
## This string is at a different offset than the corresponding "foo" string in
## foo2.s. Make sure that we treat references to either string as equivalent.
L_.str:
.asciz "foo"
.section __DATA,__cfstring
.p2align 3
## Each CFString record in this file is laid out as: class-reference pointer,
## a 32-bit value (tagged utf-8 or utf-16 by the inline comments; presumably
## the CFString flags word), 4 bytes of padding, a pointer to the character
## data, and the string length.
L__unnamed_cfstring_:
.quad ___CFConstantStringClassReference
.long 1992 ## utf-8
.space 4
.quad L_.str
.quad 3 ## strlen
## Same contents as L__unnamed_cfstring_, but under the _named_cfstring label;
## the expected disassembly at the top of this test refers to the folded
## CFString by this name.
_named_cfstring:
.quad ___CFConstantStringClassReference
.long 1992 ## utf-8
.space 4
.quad L_.str
.quad 3 ## strlen
.section __TEXT,__ustring
## NOTE(review): foo2.s places `.p2align 1` before its l_.ustr, but this file
## does not. Unclear whether the difference is deliberate -- confirm.
l_.ustr:
.short 102 ## f
.short 111 ## o
.short 0 ## \0
.short 111 ## o
.short 0 ## \0
## FIXME: We should be able to deduplicate UTF-16 CFStrings too.
## Note that this string contains a null byte in the middle -- any dedup code
## we add should take care to handle this correctly.
## Technically, UTF-8 should support encoding null bytes too, but since we
## atomize the __cstring section at every null byte, this isn't supported. ld64
## doesn't support it either, and clang seems to always emit a UTF-16 CFString
## if it needs to contain a null, so I think we're good here.
.section __DATA,__cfstring
.p2align 3
## UTF-16 CFString pointing at l_.ustr. Its length of 4 counts the embedded
## null code unit but not the terminating one.
L__unnamed_cfstring_.2:
.quad ___CFConstantStringClassReference
.long 2000 ## utf-16
.space 4
.quad l_.ustr
.quad 4 ## strlen
.text
## One exported function per CFString above. Each body is a single movq that
## loads from its CFString, so two functions are identical only if the
## CFStrings they reference are treated as the same.
.globl _foo1, _foo1_utf16, _named_foo1
_foo1:
movq L__unnamed_cfstring_(%rip), %rax
_foo1_utf16:
movq L__unnamed_cfstring_.2(%rip), %rax
_named_foo1:
movq _named_cfstring(%rip), %rax
## Presumably needed so the linker can treat each symbol's contents as a
## separately foldable unit -- confirm against the Mach-O assembler docs.
.subsections_via_symbols
#--- foo2.s
## Second input object. Mirrors foo1.s (two UTF-8 CFStrings, one UTF-16
## CFString, one exported function per CFString) with two visible differences:
## "foo" is the first literal in __cstring here, and __ustring carries an
## explicit `.p2align 1`.
.cstring
## No preceding "bar" literal, so "foo" starts the section in this object,
## whereas in foo1.s it follows "bar".
L_.str:
.asciz "foo"
.section __DATA,__cfstring
.p2align 3
## Each CFString record in this file is laid out as: class-reference pointer,
## a 32-bit value (tagged utf-8 or utf-16 by the inline comments; presumably
## the CFString flags word), 4 bytes of padding, a pointer to the character
## data, and the string length.
L__unnamed_cfstring_:
.quad ___CFConstantStringClassReference
.long 1992 ## utf-8
.space 4
.quad L_.str
.quad 3 ## strlen
## Same contents as L__unnamed_cfstring_, but under the _named_cfstring label
## that the expected disassembly refers to by name.
_named_cfstring:
.quad ___CFConstantStringClassReference
.long 1992 ## utf-8
.space 4
.quad L_.str
.quad 3 ## strlen
.section __TEXT,__ustring
.p2align 1
## Same UTF-16 code units as l_.ustr in foo1.s, including the embedded null.
l_.ustr:
.short 102 ## f
.short 111 ## o
.short 0 ## \0
.short 111 ## o
.short 0 ## \0
.section __DATA,__cfstring
.p2align 3
## UTF-16 CFString pointing at l_.ustr. The test expects this one to stay
## separate from its foo1.s counterpart (_foo1_utf16 and _foo2_utf16 keep
## distinct bodies in the expected disassembly).
L__unnamed_cfstring_.2:
.quad ___CFConstantStringClassReference
.long 2000 ## utf-16
.space 4
.quad l_.ustr
.quad 4 ## strlen
.text
## One exported function per CFString above, each a single movq that loads
## from its CFString; these are the foo2 counterparts of the foo1.s functions.
.globl _foo2, _foo2_utf16, _named_foo2
_foo2:
movq L__unnamed_cfstring_(%rip), %rax
_foo2_utf16:
movq L__unnamed_cfstring_.2(%rip), %rax
_named_foo2:
movq _named_cfstring(%rip), %rax
## Presumably needed so the linker can treat each symbol's contents as a
## separately foldable unit -- confirm against the Mach-O assembler docs.
.subsections_via_symbols