[Offload] Add oneIterationPerThread param to loop device RTL (#151959)
Currently, Flang generates no-loop kernels for every OpenMP target kernel in a program when the -fopenmp-assume-teams-oversubscription or -fopenmp-assume-threads-oversubscription flags are set. Adding an extra parameter to the loop runtime entry points lets us later choose, per kernel, which OpenMP kernels are generated as no-loop kernels. This PR does not change the current behavior of the oversubscription flags. RFC for no-loop kernels: https://discourse.llvm.org/t/rfc-no-loop-mode-for-openmp-gpu-kernels/87517
parent 0977a6d9e7
commit b69fd34e76
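For context before the diff, here is a minimal sketch of what "no-loop" mode means for a worksharing loop in the device RTL: when the caller guarantees at least one thread per iteration, the outer stride loop can be dropped. This is an illustration only; the helper name and simplified signature are assumptions, not code from this patch.

```cpp
#include <cstdint>

// Hypothetical, simplified model of a device RTL worksharing loop.
// With OneIterationPerThread nonzero, the outer stride loop disappears
// and each thread runs at most one iteration (valid only when the
// caller guarantees NumThreads >= NumIters).
template <typename Ty>
static void normalizedLoopSketch(void (*LoopBody)(Ty, void *), void *Arg,
                                 Ty NumThreads, Ty TId, Ty NumIters,
                                 uint8_t OneIterationPerThread) {
  if (OneIterationPerThread) {
    if (TId < NumIters)
      LoopBody(TId, Arg); // exactly one iteration per thread
    return;
  }
  // General mode: grid-stride loop over all iterations.
  for (Ty I = TId; I < NumIters; I += NumThreads)
    LoopBody(I, Arg);
}
```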
@@ -470,18 +470,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
 __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
           VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
-__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int8)
 __OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
 __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
@@ -674,22 +674,22 @@ __OMP_RTL_ATTRS(__kmpc_cancel_barrier, BarrierAttrs, SExt,
                 ParamAttrs(ReadOnlyPtrAttrs, SExt))
 __OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           SExt, SExt, SExt, SExt))
+                           SExt, SExt, SExt, SExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           ZExt, ZExt, ZExt, ZExt))
+                           ZExt, ZExt, ZExt, ZExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           SExt, SExt))
+                           SExt, SExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           ZExt, ZExt))
+                           ZExt, ZExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           SExt, SExt, SExt))
+                           SExt, SExt, SExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           ZExt, ZExt, ZExt))
+                           ZExt, ZExt, ZExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_error, AttributeSet(), AttributeSet(),
                 ParamAttrs(AttributeSet(), SExt))
 __OMP_RTL_ATTRS(__kmpc_flush, BarrierAttrs, AttributeSet(),
@@ -4969,6 +4969,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
   RealArgs.push_back(TripCount);
   if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
     Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
     Builder.CreateCall(RTLFn, RealArgs);
     return;
@@ -4984,6 +4985,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
   if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
   }
+  RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
 
   Builder.CreateCall(RTLFn, RealArgs);
 }
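Note that OMPIRBuilder always passes a constant 0 for the new i8 argument, which is why host-side codegen behavior is unchanged. As a hedged sketch of how a frontend could later thread a per-kernel decision through this spot: the helper and its `NoLoopMode` flag below are hypothetical and not part of this patch.

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

// Hypothetical helper: materialize the new i8 argument from a per-kernel
// flag instead of the hardcoded 0 above. `NoLoopMode` stands in for a
// future per-kernel no-loop selection mechanism.
static llvm::Value *getOneIterationPerThreadArg(llvm::IRBuilderBase &Builder,
                                                bool NoLoopMode) {
  return llvm::ConstantInt::get(Builder.getInt8Ty(), NoLoopMode ? 1 : 0);
}
```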
@@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK-SAME: #[[ATTRS1:[0-9]+]]
 // CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB]] to ptr),
 // CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 10,
-// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 0)
+// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i8 0)
 
 // CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) #[[ATTRS2:[0-9]+]] {
 
@@ -25,7 +25,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: define void @[[FUNC_COLLAPSED_WSLOOP:.*]](ptr %[[ARG0:.*]])
 // CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr),
 // CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 10000,
-// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i32 0)
+// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i8 0)
 
 // CHECK: define internal void @[[COLLAPSED_WSLOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
 // CHECK: %[[TMP0:.*]] = urem i32 %[[LOOP_CNT]], 100
@@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: %[[GEP:.*]] = getelementptr { ptr }, ptr addrspace(5) %[[STRUCTARG]], i32 0, i32 0
 // CHECK: store ptr %[[ARG0]], ptr addrspace(5) %[[GEP]], align 8
 // CHECK: %[[NUM_THREADS:.*]] = call i32 @omp_get_num_threads()
-// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0)
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0, i8 0)
 
 // CHECK: define internal void @[[LOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
 // CHECK: %[[GEP2:.*]] = getelementptr { ptr }, ptr %[[LOOP_BODY_ARG]], i32 0, i32 0
@@ -46,6 +46,6 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: store i32 %[[VAL0:.*]], ptr %[[GEP3]], align 4
 
 // CHECK: define void @[[FUNC_EMPTY_WSLOOP:.*]]()
-// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0, i8 0)
 
 // CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
@@ -698,7 +698,7 @@ template <typename Ty> class StaticLoopChunker {
   static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
                                         Ty NumBlocks, Ty BId, Ty NumThreads,
                                         Ty TId, Ty NumIters,
-                                        bool OneIterationPerThread) {
+                                        uint8_t OneIterationPerThread) {
     Ty KernelIteration = NumBlocks * NumThreads;
 
     // Start index in the normalized space.
@@ -729,7 +729,7 @@ template <typename Ty> class StaticLoopChunker {
                                         Ty BlockChunk, Ty NumBlocks, Ty BId,
                                         Ty ThreadChunk, Ty NumThreads, Ty TId,
                                         Ty NumIters,
-                                        bool OneIterationPerThread) {
+                                        uint8_t OneIterationPerThread) {
     Ty KernelIteration = NumBlocks * BlockChunk;
 
     // Start index in the chunked space.
@@ -767,8 +767,18 @@ template <typename Ty> class StaticLoopChunker {
 
 public:
   /// Worksharing `for`-loop.
+  /// \param[in] Loc Description of source location
+  /// \param[in] LoopBody Function which corresponds to loop body
+  /// \param[in] Arg Pointer to struct which contains loop body args
+  /// \param[in] NumIters Number of loop iterations
+  /// \param[in] NumThreads Number of GPU threads
+  /// \param[in] ThreadChunk Size of thread chunk
+  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
+  /// only one loop iteration or one thread chunk. This avoids an outer loop
+  /// over all loop iterations/chunks.
   static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+                  Ty NumIters, Ty NumThreads, Ty ThreadChunk,
+                  uint8_t OneIterationPerThread) {
     ASSERT(NumIters >= 0, "Bad iteration count");
     ASSERT(ThreadChunk >= 0, "Bad thread count");
 
@@ -790,12 +800,13 @@ public:
 
     // If we know we have more threads than iterations we can indicate that to
     // avoid an outer loop.
-    bool OneIterationPerThread = false;
     if (config::getAssumeThreadsOversubscription()) {
-      ASSERT(NumThreads >= NumIters, "Broken assumption");
       OneIterationPerThread = true;
     }
 
+    if (OneIterationPerThread)
+      ASSERT(NumThreads >= NumIters, "Broken assumption");
+
     if (ThreadChunk != 1)
       NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                 ThreadChunk, NumThreads, TId, NumIters,
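With this restructuring, the caller-supplied parameter and the global oversubscription assumption combine: either source can enable the no-loop path, and the invariant is checked exactly when that path will be taken. A condensed restatement of the logic above, not a verbatim excerpt:

```cpp
// Either the new parameter or the global flag enables no-loop mode.
if (config::getAssumeThreadsOversubscription())
  OneIterationPerThread = true; // global flag still forces no-loop mode
// The invariant is only asserted when the no-loop path is active.
if (OneIterationPerThread)
  ASSERT(NumThreads >= NumIters, "Broken assumption");
// ...NormalizedLoopNestChunked/NoChunk then receive OneIterationPerThread.
```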
@@ -806,8 +817,17 @@ public:
   }
 
   /// Worksharing `distribute`-loop.
+  /// \param[in] Loc Description of source location
+  /// \param[in] LoopBody Function which corresponds to loop body
+  /// \param[in] Arg Pointer to struct which contains loop body args
+  /// \param[in] NumIters Number of loop iterations
+  /// \param[in] BlockChunk Size of block chunk
+  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
+  /// only one loop iteration or one thread chunk. This avoids an outer loop
+  /// over all loop iterations/chunks.
   static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                         Ty NumIters, Ty BlockChunk) {
+                         Ty NumIters, Ty BlockChunk,
+                         uint8_t OneIterationPerThread) {
     ASSERT(icv::Level == 0, "Bad distribute");
     ASSERT(icv::ActiveLevel == 0, "Bad distribute");
     ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -831,12 +851,13 @@ public:
 
     // If we know we have more blocks than iterations we can indicate that to
     // avoid an outer loop.
-    bool OneIterationPerThread = false;
     if (config::getAssumeTeamsOversubscription()) {
-      ASSERT(NumBlocks >= NumIters, "Broken assumption");
       OneIterationPerThread = true;
     }
 
+    if (OneIterationPerThread)
+      ASSERT(NumBlocks >= NumIters, "Broken assumption");
+
     if (BlockChunk != NumThreads)
       NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                 ThreadChunk, NumThreads, TId, NumIters,
@@ -852,9 +873,20 @@ public:
   }
 
   /// Worksharing `distribute parallel for`-loop.
+  /// \param[in] Loc Description of source location
+  /// \param[in] LoopBody Function which corresponds to loop body
+  /// \param[in] Arg Pointer to struct which contains loop body args
+  /// \param[in] NumIters Number of loop iterations
+  /// \param[in] NumThreads Number of GPU threads
+  /// \param[in] BlockChunk Size of block chunk
+  /// \param[in] ThreadChunk Size of thread chunk
+  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
+  /// only one loop iteration or one thread chunk. This avoids an outer loop
+  /// over all loop iterations/chunks.
   static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                             void *Arg, Ty NumIters, Ty NumThreads,
-                            Ty BlockChunk, Ty ThreadChunk) {
+                            Ty BlockChunk, Ty ThreadChunk,
+                            uint8_t OneIterationPerThread) {
     ASSERT(icv::Level == 1, "Bad distribute");
     ASSERT(icv::ActiveLevel == 1, "Bad distribute");
     ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -882,13 +914,14 @@ public:
 
     // If we know we have more threads (across all blocks) than iterations we
     // can indicate that to avoid an outer loop.
-    bool OneIterationPerThread = false;
     if (config::getAssumeTeamsOversubscription() &
         config::getAssumeThreadsOversubscription()) {
       OneIterationPerThread = true;
-      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
     }
 
+    if (OneIterationPerThread)
+      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+
     if (BlockChunk != NumThreads || ThreadChunk != 1)
       NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                 ThreadChunk, NumThreads, TId, NumIters,
@@ -909,22 +942,24 @@ public:
   [[gnu::flatten, clang::always_inline]] void \
   __kmpc_distribute_for_static_loop##BW( \
       IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
-      TY num_threads, TY block_chunk, TY thread_chunk) { \
+      TY num_threads, TY block_chunk, TY thread_chunk, \
+      uint8_t one_iteration_per_thread) { \
     ompx::StaticLoopChunker<TY>::DistributeFor( \
-        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \
+        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
+        one_iteration_per_thread); \
   } \
   [[gnu::flatten, clang::always_inline]] void \
   __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
-                                    void *arg, TY num_iters, \
-                                    TY block_chunk) { \
-    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters, \
-                                            block_chunk); \
+                                    void *arg, TY num_iters, TY block_chunk, \
+                                    uint8_t one_iteration_per_thread) { \
+    ompx::StaticLoopChunker<TY>::Distribute( \
+        loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
   } \
   [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
       IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
-      TY num_threads, TY thread_chunk) { \
+      TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) { \
     ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
-                                     thread_chunk); \
+                                     thread_chunk, one_iteration_per_thread); \
   }
 
 extern "C" {
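To illustrate the extended ABI, here is a hypothetical device-side call of the 32-bit unsigned variant. The declaration mirrors the macro expansion above; the kernel fragment, the body function, and the chosen argument values are made up for illustration (thread_chunk = 0 matches what the MLIR tests above emit).

```cpp
#include <cstdint>

struct IdentTy; // opaque source-location descriptor from the RTL

extern "C" void __kmpc_for_static_loop_4u(IdentTy *loc,
                                          void (*fn)(uint32_t, void *),
                                          void *arg, uint32_t num_iters,
                                          uint32_t num_threads,
                                          uint32_t thread_chunk,
                                          uint8_t one_iteration_per_thread);

static void body(uint32_t i, void *arg) { /* loop body for iteration i */ }

void kernelFragment(IdentTy *loc, uint32_t num_iters, uint32_t num_threads) {
  // Passing 1 for the trailing i8 requests no-loop mode; this is only
  // valid if the caller guarantees num_threads >= num_iters.
  __kmpc_for_static_loop_4u(loc, body, /*arg=*/nullptr, num_iters, num_threads,
                            /*thread_chunk=*/0, /*one_iteration_per_thread=*/1);
}
```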
|
Loading…
x
Reference in New Issue
Block a user