
This MR adds support for cmpxchg instructions with syncscope. - Adds a new definition for atomic 3-operand instructions, with constant operands for sem, scope and addsp. - Lowers cmpxchg SDNodes populating sem, scope and addsp using SDNodeXForms. - Handles syncscope correctly for emulation loops in AtomicExpand, in bracketInstructionWithFences. - Modifies emitLeadingFence, emitTrailingFence to accept SyncScope as a parameter. Modifies implementation of these in other backends, with the parameter being ignored. - Tests for a _slice_ of all possible combinations of the cmpxchg instruction (with modifications to cmpxchg.py) --------- Co-authored-by: gonzalobg <65027571+gonzalobg@users.noreply.github.com>
134 lines
5.2 KiB
Python
134 lines
5.2 KiB
Python
# For manual usage, not as a part of lit tests. Used for generating the following tests:
|
|
# cmpxchg-sm60.ll, cmpxchg-sm70.ll, cmpxchg-sm90.ll
|
|
|
|
from string import Template
|
|
from itertools import product
|
|
|
|
# Template for one IR test function performing a cmpxchg with an explicit
# syncscope. Placeholders: $size (integer bit width), $success/$failure
# (orderings), $addrspace (name embedded in the function symbol),
# $addrspace_cast (" addrspace(N)" pointer qualifier, or "" for generic),
# $llvm_scope (LLVM syncscope name) and $ptx_scope (PTX scope mnemonic,
# used only in the symbol name).
cmpxchg_func = Template(
    """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
%pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure
ret i$size %new
}
"""
)
|
|
|
|
# Same as cmpxchg_func, but without a syncscope on the instruction
# (no $llvm_scope/$ptx_scope placeholders; an unscoped cmpxchg defaults
# to system scope).
cmpxchg_func_no_scope = Template(
    """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
%pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
ret i$size %new
}
"""
)
|
|
|
|
# Template for the RUN lines emitted at the top of each generated .ll file:
# an llc + FileCheck invocation, plus a ptxas verification step that lit
# only runs when ptxas is available. Placeholders: $sm (SM arch number)
# and $ptx (PTX ISA version for -mattr).
run_statement = Template(
    """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm}
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %}
"""
)
|
|
|
|
|
|
def get_addrspace_cast(addrspace):
    """Return the ``addrspace(N)`` pointer qualifier (with a leading space)
    for a numeric address space, or the empty string for the generic
    address space (0)."""
    if not addrspace:
        return ""
    return f" addrspace({addrspace})"
|
|
|
|
|
|
# (SM architecture, minimum PTX ISA version) pairs; one output file is
# generated per pair.
TESTS = [(60, 50), (70, 63), (90, 87)]

# LLVM syncscope names to test; "" means no syncscope on the instruction
# (the unscoped/system-scope variant).
LLVM_SCOPES = ["", "block", "cluster", "device"]

# Maps each LLVM syncscope name to the PTX scope mnemonic used in the
# generated function symbol names.
SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"}

SUCCESS_ORDERINGS = ["monotonic", "acquire", "release", "acq_rel", "seq_cst"]

# NOTE: per the LLVM LangRef, the cmpxchg failure ordering may not be
# release or acq_rel, hence the shorter list.
FAILURE_ORDERINGS = ["monotonic", "acquire", "seq_cst"]

# Operand bit widths to test (i8/i16 require emulation loops).
SIZES = [8, 16, 32, 64]

# Numeric address spaces to test, named by the table below.
ADDRSPACES = [0, 1, 3]

ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"}
|
|
|
|
|
|
if __name__ == "__main__":
    # Generate one cmpxchg-sm<N>.ll test file per (SM arch, PTX version) pair.
    for sm, ptx in TESTS:
        with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
            print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)

            # Our test space is: SIZES X SUCCESS_ORDERINGS X FAILURE_ORDERINGS X ADDRSPACES X LLVM_SCOPES
            # This is very large, so we instead test 3 slices.

            # First slice: are all orderings correctly supported, with and without emulation loops?
            # set addrspace to global, scope to cta, generate all possible orderings, for all operation sizes
            addrspace, llvm_scope = 1, "block"
            for size, success, failure in product(
                SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS
            ):
                print(
                    cmpxchg_func.substitute(
                        success=success,
                        failure=failure,
                        size=size,
                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
                        addrspace_cast=get_addrspace_cast(addrspace),
                        llvm_scope=llvm_scope,
                        ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
                    ),
                    file=fp,
                )

            # Second slice: Are all scopes correctly supported, with and without emulation loops?
            # fix addrspace, ordering, generate all possible scopes, for operation sizes i8, i32
            addrspace, success, failure = 1, "acq_rel", "acquire"
            for size in [8, 32]:
                # Unscoped variant first (defaults to system scope).
                print(
                    cmpxchg_func_no_scope.substitute(
                        success=success,
                        failure=failure,
                        size=size,
                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
                        addrspace_cast=get_addrspace_cast(addrspace),
                    ),
                    file=fp,
                )

                for llvm_scope in LLVM_SCOPES:
                    # cluster scope is only available from sm_90 onwards.
                    if sm < 90 and llvm_scope == "cluster":
                        continue
                    if llvm_scope == "block":
                        # skip (acq_rel, acquire, global, cta)
                        # (already emitted by the first slice)
                        continue
                    print(
                        cmpxchg_func.substitute(
                            success=success,
                            failure=failure,
                            size=size,
                            addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
                            addrspace_cast=get_addrspace_cast(addrspace),
                            llvm_scope=llvm_scope,
                            ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
                        ),
                        file=fp,
                    )

            # Third slice: Are all address spaces correctly supported?
            # fix ordering, scope, generate all possible address spaces, for operation sizes i8, i32
            success, failure, llvm_scope = "acq_rel", "acquire", "block"
            for size, addrspace in product([8, 32], ADDRSPACES):
                if addrspace == 1:
                    # skip (acq_rel, acquire, global, cta)
                    # (already emitted by the first slice)
                    continue
                print(
                    cmpxchg_func.substitute(
                        success=success,
                        failure=failure,
                        size=size,
                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
                        addrspace_cast=get_addrspace_cast(addrspace),
                        llvm_scope=llvm_scope,
                        ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
                    ),
                    file=fp,
                )
|