llvm-project/polly/lib/CodeGen/LoopGeneratorsKMP.cpp
Michael Kruse fe0e5b3e43 [Polly] Insert !dbg metadata for emitted CallInsts.
The IR Verifier requires that every call instruction to an inlineable
function (among other things, its implementation must be visible in the
translation unit) must also have !dbg metadata attached to it. When
parallelizing, Polly emits calls to OpenMP runtime functions out of thin
air, or at least not directly derived from a bounded list of previous
instructions. While we could search for instructions in the SCoP that have
some debug info attached to them, there is no guarantee that we find any.
Our solution is to generate a new DILocation that points to line 0 to
represent optimized code.

The OpenMP function implementation is usually not available in the
user's translation unit, but can become visible in an LTO build. For
the bug to appear, libomp must also be built with debug symbols.

IMHO, the IR verifier rule is too strict. Runtime functions can
also be inserted by other optimization passes, such as
LoopIdiomRecognize. When inserting a call to e.g. memset, it uses the
DebugLoc from a StoreInst from the unoptimized code. It is not
required to have !dbg metadata attached either.

Fixes #56692
2022-07-26 19:43:53 -05:00

568 lines
22 KiB
C++

//===------ LoopGeneratorsKMP.cpp - IR helper to create loops -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains functions to create parallel loops as LLVM-IR.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/LoopGeneratorsKMP.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Module.h"
using namespace llvm;
using namespace polly;
/// Emit a call to __kmpc_fork_call that passes \p SubFn to the OpenMP runtime.
///
/// __kmpc_fork_call is looked up in the module and declared if it is missing.
/// \p LB, \p UB, \p Stride and \p SubFnParam are passed, in that order, as the
/// four trailing variadic arguments after the microtask pointer. The emitted
/// call is tagged with the DLGenerated debug location.
void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn,
                                                      Value *SubFnParam,
                                                      Value *LB, Value *UB,
                                                      Value *Stride) {
  const std::string ForkName = "__kmpc_fork_call";

  // Reuse a type named "kmpc_micro" if the context knows one; otherwise build
  // the microtask signature:
  //   void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...)
  Type *MicroTy = StructType::getTypeByName(M->getContext(), "kmpc_micro");
  if (MicroTy == nullptr) {
    Type *MicroArgTys[] = {Builder.getInt32Ty()->getPointerTo(),
                           Builder.getInt32Ty()->getPointerTo()};
    MicroTy = FunctionType::get(Builder.getVoidTy(), MicroArgTys, true);
  }

  Function *ForkFn = M->getFunction(ForkName);
  if (ForkFn == nullptr) {
    // Not declared yet: add an external, variadic, void declaration taking the
    // ident_t pointer, an i32 and the microtask pointer.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ForkArgTys[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(),
                          MicroTy->getPointerTo()};
    FunctionType *ForkTy =
        FunctionType::get(Builder.getVoidTy(), ForkArgTys, true);
    ForkFn = Function::Create(ForkTy, Function::ExternalLinkage, ForkName, M);
  }

  // The outlined function has to be passed with the microtask pointer type.
  Value *MicroTask = Builder.CreatePointerBitCastOrAddrSpaceCast(
      SubFn, MicroTy->getPointerTo());

  Value *ForkArgs[] = {SourceLocationInfo,
                       Builder.getInt32(4) /* Number of arguments (w/o Task) */,
                       MicroTask,
                       LB,
                       UB,
                       Stride,
                       SubFnParam};

  CallInst *ForkCall = Builder.CreateCall(ForkFn, ForkArgs);
  ForkCall->setDebugLoc(DLGenerated);
}
/// Emit the runtime calls that launch \p SubFn in parallel.
///
/// When PollyNumThreads is positive, the requested thread count is pushed to
/// the runtime first; afterwards the fork call is emitted unconditionally.
void ParallelLoopGeneratorKMP::deployParallelExecution(Function *SubFn,
                                                       Value *SubFnParam,
                                                       Value *LB, Value *UB,
                                                       Value *Stride) {
  const bool ThreadCountRequested = PollyNumThreads > 0;
  if (ThreadCountRequested) {
    // The push call takes the calling thread's global id as an argument.
    Value *CallerTid = createCallGlobalThreadNum();
    Value *RequestedThreads = Builder.getInt32(PollyNumThreads);
    createCallPushNumThreads(CallerTid, RequestedThreads);
  }

  // Hand the outlined function, its bounds and its data to the runtime.
  createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride);
}
/// Create the declaration of the outlined function for the parallel loop.
///
/// The new function returns void, has internal linkage, is named after \p F
/// with the suffix "_polly_subfn" and takes six parameters: two i32 pointers,
/// three LongType values and one i8 pointer. The parameters are named
/// polly.kmpc.{global_tid, bound_tid, lb, ub, inc, shared} in that order.
Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const {
  Type *ParamTys[] = {Builder.getInt32Ty()->getPointerTo(),
                      Builder.getInt32Ty()->getPointerTo(),
                      LongType,
                      LongType,
                      LongType,
                      Builder.getInt8PtrTy()};

  FunctionType *SubFnTy =
      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
  Function *SubFn = Function::Create(SubFnTy, Function::InternalLinkage,
                                     F->getName() + "_polly_subfn", M);

  // One name per parameter, in declaration order.
  const char *const ParamNames[] = {
      "polly.kmpc.global_tid", "polly.kmpc.bound_tid", "polly.kmpc.lb",
      "polly.kmpc.ub",         "polly.kmpc.inc",       "polly.kmpc.shared"};

  Function::arg_iterator ParamIt = SubFn->arg_begin();
  for (const char *ParamName : ParamNames) {
    ParamIt->setName(ParamName);
    ++ParamIt;
  }

  return SubFn;
}
// Create a subfunction of the following (preliminary) structure:
//
//        PrevBB
//           |
//           v
//        HeaderBB
//       /   |    _____
//      /    v   v     |
//     / PreHeaderBB   |
//    |      |         |
//    |      v         |
//    |  CheckNextBB   |
//     \     |   \_____/
//      \    |
//       v   v
//       ExitBB
//
// HeaderBB will hold allocations, loading of variables and kmp-init calls.
// CheckNextBB will check for more work (dynamic / static chunked) or will be
// empty (static non chunked).
// If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB.
// PreHeaderBB loads the new boundaries (& will lead to the loop body later on).
// Just like CheckNextBB: PreHeaderBB is (preliminary) empty in the static non
// chunked scheduling case. ExitBB marks the end of the parallel execution.
// The possibly empty BasicBlocks will automatically be removed.
//
// Parameters:
//   SequentialLoopStride  Stride handed to createLoop() for the loop that is
//                         generated inside the subfunction.
//   StructData            Alloca of the shared-data struct; its pointer type
//                         is used to cast the subfunction's last argument and
//                         its allocated type is passed on to
//                         extractValuesFromStruct().
//   Data, Map             Forwarded unchanged to extractValuesFromStruct().
//
// Returns a tuple of the value returned by createLoop() and the new
// subfunction. On return the builder's insert point is the position it had
// right after createLoop() returned.
std::tuple<Value *, Function *>
ParallelLoopGeneratorKMP::createSubFn(Value *SequentialLoopStride,
AllocaInst *StructData,
SetVector<Value *> Data, ValueMapT &Map) {
Function *SubFn = createSubFnDefinition();
LLVMContext &Context = SubFn->getContext();
// Store the previous basic block.
BasicBlock *PrevBB = Builder.GetInsertBlock();
// Create basic blocks.
BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn);
BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn);
BasicBlock *CheckNextBB =
BasicBlock::Create(Context, "polly.par.checkNext", SubFn);
BasicBlock *PreHeaderBB =
BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn);
// Register the new blocks in the dominator tree: HeaderBB below PrevBB, the
// other three directly below HeaderBB.
DT.addNewBlock(HeaderBB, PrevBB);
DT.addNewBlock(ExitBB, HeaderBB);
DT.addNewBlock(CheckNextBB, HeaderBB);
DT.addNewBlock(PreHeaderBB, HeaderBB);
// Fill up basic block HeaderBB.
Builder.SetInsertPoint(HeaderBB);
// Stack slots whose addresses are handed to the kmp init / dispatch calls.
Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr");
Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr");
Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
"polly.par.lastIterPtr");
Value *StridePtr =
Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr");
// Get iterator for retrieving the previously defined parameters.
Function::arg_iterator AI = SubFn->arg_begin();
// First argument holds "global thread ID".
Value *IDPtr = &*AI;
// Skip "bound thread ID" since it is not used (but had to be defined).
std::advance(AI, 2);
// Move iterator to: LB, UB, Stride, Shared variable struct.
Value *LB = &*AI;
std::advance(AI, 1);
Value *UB = &*AI;
std::advance(AI, 1);
Value *Stride = &*AI;
std::advance(AI, 1);
Value *Shared = &*AI;
// Cast the shared pointer to the type of StructData and unpack its members.
Value *UserContext = Builder.CreateBitCast(Shared, StructData->getType(),
"polly.par.userContext");
extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext,
Map);
// NOTE(review): this alignment follows the width of LongType (8 or 4 bytes)
// but is also applied to the i32 accesses below (the global_tid load and the
// IsLastPtr store) — confirm that using 8 there is intended.
const auto Alignment = llvm::Align(is64BitArch() ? 8 : 4);
Value *ID = Builder.CreateAlignedLoad(Builder.getInt32Ty(), IDPtr, Alignment,
"polly.par.global_tid");
// Seed the stack slots with the incoming bounds and stride; the last-iteration
// flag starts out as 0.
Builder.CreateAlignedStore(LB, LBPtr, Alignment);
Builder.CreateAlignedStore(UB, UBPtr, Alignment);
Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Alignment);
Builder.CreateAlignedStore(Stride, StridePtr, Alignment);
// Subtract one as the upper bound provided by openmp is a < comparison
// whereas the codegenForSequential function creates a <= comparison.
Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1),
"polly.indvar.UBAdjusted");
// The chunk size handed to the runtime is at least 1, whatever
// PollyChunkSize is.
Value *ChunkSize =
ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1));
// Effective schedule: getSchedType() may replace the requested one.
OMPGeneralSchedulingType Scheduling =
getSchedType(PollyChunkSize, PollyScheduling);
switch (Scheduling) {
case OMPGeneralSchedulingType::Dynamic:
case OMPGeneralSchedulingType::Guided:
case OMPGeneralSchedulingType::Runtime:
// "DYNAMIC" scheduling types are handled below (including 'runtime')
{
// HeaderBB: initialize the dispatcher with the inclusive upper bound and ask
// for a first chunk; a result equal to 1 is treated as "there is work".
UB = AdjustedUB;
createCallDispatchInit(ID, LB, UB, Stride, ChunkSize);
Value *HasWork =
createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
Value *HasIteration =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
Builder.getInt32(1), "polly.hasIteration");
Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
// CheckNextBB: ask for the next chunk and loop back or leave.
Builder.SetInsertPoint(CheckNextBB);
HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
HasIteration =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
Builder.getInt32(1), "polly.hasWork");
Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
// PreHeaderBB: reload the bounds from the slots passed to the dispatch call.
Builder.SetInsertPoint(PreHeaderBB);
LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment,
"polly.indvar.LB");
UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment,
"polly.indvar.UB");
}
break;
case OMPGeneralSchedulingType::StaticChunked:
case OMPGeneralSchedulingType::StaticNonChunked:
// "STATIC" scheduling types are handled below
{
// HeaderBB: store the inclusive upper bound, hand the slots to the
// static-init call and re-read stride and bounds afterwards.
Builder.CreateAlignedStore(AdjustedUB, UBPtr, Alignment);
createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize);
Value *ChunkedStride = Builder.CreateAlignedLoad(
LongType, StridePtr, Alignment, "polly.kmpc.stride");
LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment,
"polly.indvar.LB");
UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment,
"polly.indvar.UB.temp");
// Clamp the re-read upper bound to AdjustedUB and write it back.
Value *UBInRange =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, UB, AdjustedUB,
"polly.indvar.UB.inRange");
UB = Builder.CreateSelect(UBInRange, UB, AdjustedUB, "polly.indvar.UB");
Builder.CreateAlignedStore(UB, UBPtr, Alignment);
// Enter the loop only if the range [LB, UB] is non-empty.
Value *HasIteration = Builder.CreateICmp(
llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration");
Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
// Chunked only: PreHeaderBB reloads the bounds from the slots, because
// CheckNextBB updates them for every further chunk.
if (Scheduling == OMPGeneralSchedulingType::StaticChunked) {
Builder.SetInsertPoint(PreHeaderBB);
LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment,
"polly.indvar.LB.entry");
UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment,
"polly.indvar.UB.entry");
}
Builder.SetInsertPoint(CheckNextBB);
if (Scheduling == OMPGeneralSchedulingType::StaticChunked) {
// Advance both bounds by the stride read after static-init, clamp the new
// upper bound to AdjustedUB, store both and continue while the new lower
// bound is still <= AdjustedUB.
Value *NextLB =
Builder.CreateAdd(LB, ChunkedStride, "polly.indvar.nextLB");
Value *NextUB = Builder.CreateAdd(UB, ChunkedStride);
Value *NextUBOutOfBounds =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SGT, NextUB,
AdjustedUB, "polly.indvar.nextUB.outOfBounds");
NextUB = Builder.CreateSelect(NextUBOutOfBounds, AdjustedUB, NextUB,
"polly.indvar.nextUB");
Builder.CreateAlignedStore(NextLB, LBPtr, Alignment);
Builder.CreateAlignedStore(NextUB, UBPtr, Alignment);
Value *HasWork =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, NextLB,
AdjustedUB, "polly.hasWork");
Builder.CreateCondBr(HasWork, PreHeaderBB, ExitBB);
} else {
// Non-chunked: there is no further chunk, so CheckNextBB leaves directly.
Builder.CreateBr(ExitBB);
}
Builder.SetInsertPoint(PreHeaderBB);
}
break;
}
// In every case the builder is now in PreHeaderBB: terminate it with a branch
// to CheckNextBB, then step back onto that branch so that createLoop() is
// called with the insert point in front of it.
Builder.CreateBr(CheckNextBB);
Builder.SetInsertPoint(&*--Builder.GetInsertPoint());
BasicBlock *AfterBB;
Value *IV = createLoop(LB, UB, SequentialLoopStride, Builder, LI, DT, AfterBB,
ICmpInst::ICMP_SLE, nullptr, true,
/* UseGuard */ false);
// Remember where createLoop() left the builder; it is restored before
// returning.
BasicBlock::iterator LoopBody = Builder.GetInsertPoint();
// Add code to terminate this subfunction.
Builder.SetInsertPoint(ExitBB);
// Static (i.e. non-dynamic) scheduling types, are terminated with a fini-call
if (Scheduling == OMPGeneralSchedulingType::StaticChunked ||
Scheduling == OMPGeneralSchedulingType::StaticNonChunked) {
createCallStaticFini(ID);
}
Builder.CreateRetVoid();
Builder.SetInsertPoint(&*LoopBody);
return std::make_tuple(IV, SubFn);
}
/// Emit a call to __kmpc_global_thread_num and return the call instruction.
///
/// The runtime function is declared in the module (returning i32 and taking a
/// pointer to struct.ident_t) if it is not present yet. The only argument
/// passed is SourceLocationInfo.
Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() {
  const std::string RuntimeFnName = "__kmpc_global_thread_num";

  Function *RuntimeFn = M->getFunction(RuntimeFnName);
  if (RuntimeFn == nullptr) {
    // First use in this module: add an external declaration.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {IdentTy->getPointerTo()};
    FunctionType *RuntimeFnTy =
        FunctionType::get(Builder.getInt32Ty(), ParamTys, false);
    RuntimeFn = Function::Create(RuntimeFnTy, Function::ExternalLinkage,
                                 RuntimeFnName, M);
  }

  CallInst *ThreadNum = Builder.CreateCall(RuntimeFn, {SourceLocationInfo});
  ThreadNum->setDebugLoc(DLGenerated);
  return ThreadNum;
}
/// Emit a call to __kmpc_push_num_threads.
///
/// The runtime function is declared in the module (void, taking a pointer to
/// struct.ident_t and two i32 values) if it is not present yet. It is called
/// with SourceLocationInfo, \p GlobalThreadID and \p NumThreads.
void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID,
                                                        Value *NumThreads) {
  const std::string RuntimeFnName = "__kmpc_push_num_threads";

  Function *RuntimeFn = M->getFunction(RuntimeFnName);
  if (RuntimeFn == nullptr) {
    // First use in this module: add an external declaration.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(),
                        Builder.getInt32Ty()};
    FunctionType *RuntimeFnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    RuntimeFn = Function::Create(RuntimeFnTy, Function::ExternalLinkage,
                                 RuntimeFnName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo, GlobalThreadID, NumThreads};
  CallInst *PushCall = Builder.CreateCall(RuntimeFn, CallArgs);
  PushCall->setDebugLoc(DLGenerated);
}
/// Emit a call to the static-schedule init routine of the OpenMP runtime.
///
/// The callee is __kmpc_for_static_init_8 when is64BitArch() holds and
/// __kmpc_for_static_init_4 otherwise; it is declared in the module if it is
/// not present yet. \p IsLastPtr, \p LBPtr, \p UBPtr and \p StridePtr are
/// passed through as pointer arguments, the schedule id is taken from
/// getSchedType(), and a LongType constant 1 followed by \p ChunkSize are the
/// last two arguments.
void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID,
                                                    Value *IsLastPtr,
                                                    Value *LBPtr, Value *UBPtr,
                                                    Value *StridePtr,
                                                    Value *ChunkSize) {
  const std::string InitFnName =
      is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4";

  Function *InitFn = M->getFunction(InitFnName);
  if (InitFn == nullptr) {
    // First use in this module: add an external declaration. The ident_t type
    // is only needed for this declaration.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {IdentTy->getPointerTo(),
                        Builder.getInt32Ty(),
                        Builder.getInt32Ty(),
                        Builder.getInt32Ty()->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType,
                        LongType};
    FunctionType *InitFnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    InitFn =
        Function::Create(InitFnTy, Function::ExternalLinkage, InitFnName, M);
  }

  // The parameter 'ChunkSize' will hold strictly positive integer values,
  // regardless of PollyChunkSize's value
  Value *ScheduleId = Builder.getInt32(
      static_cast<int>(getSchedType(PollyChunkSize, PollyScheduling)));
  Value *CallArgs[] = {SourceLocationInfo,
                       GlobalThreadID,
                       ScheduleId,
                       IsLastPtr,
                       LBPtr,
                       UBPtr,
                       StridePtr,
                       ConstantInt::get(LongType, 1),
                       ChunkSize};

  CallInst *InitCall = Builder.CreateCall(InitFn, CallArgs);
  InitCall->setDebugLoc(DLGenerated);
}
/// Emit a call to __kmpc_for_static_fini.
///
/// The runtime function is declared in the module (void, taking a pointer to
/// struct.ident_t and an i32) if it is not present yet. It is called with
/// SourceLocationInfo and \p GlobalThreadID.
void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) {
  const std::string FiniFnName = "__kmpc_for_static_fini";

  Function *FiniFn = M->getFunction(FiniFnName);
  if (FiniFn == nullptr) {
    // First use in this module: add an external declaration. The ident_t type
    // is only needed for this declaration.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()};
    FunctionType *FiniFnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    FiniFn =
        Function::Create(FiniFnTy, Function::ExternalLinkage, FiniFnName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo, GlobalThreadID};
  CallInst *FiniCall = Builder.CreateCall(FiniFn, CallArgs);
  FiniCall->setDebugLoc(DLGenerated);
}
/// Emit a call to the dispatch init routine of the OpenMP runtime.
///
/// The callee is __kmpc_dispatch_init_8 when is64BitArch() holds and
/// __kmpc_dispatch_init_4 otherwise; it is declared in the module if it is
/// not present yet. The call passes SourceLocationInfo, \p GlobalThreadID, the
/// schedule id from getSchedType(), and then \p LB, \p UB, \p Inc and
/// \p ChunkSize by value.
void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID,
                                                      Value *LB, Value *UB,
                                                      Value *Inc,
                                                      Value *ChunkSize) {
  const std::string InitFnName =
      is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4";

  Function *InitFn = M->getFunction(InitFnName);
  if (InitFn == nullptr) {
    // First use in this module: add an external declaration. The ident_t type
    // is only needed for this declaration.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {IdentTy->getPointerTo(),
                        Builder.getInt32Ty(),
                        Builder.getInt32Ty(),
                        LongType,
                        LongType,
                        LongType,
                        LongType};
    FunctionType *InitFnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    InitFn =
        Function::Create(InitFnTy, Function::ExternalLinkage, InitFnName, M);
  }

  // The parameter 'ChunkSize' will hold strictly positive integer values,
  // regardless of PollyChunkSize's value
  Value *ScheduleId = Builder.getInt32(
      static_cast<int>(getSchedType(PollyChunkSize, PollyScheduling)));
  Value *CallArgs[] = {SourceLocationInfo,
                       GlobalThreadID,
                       ScheduleId,
                       LB,
                       UB,
                       Inc,
                       ChunkSize};

  CallInst *InitCall = Builder.CreateCall(InitFn, CallArgs);
  InitCall->setDebugLoc(DLGenerated);
}
/// Emit a call to the dispatch "next" routine of the OpenMP runtime and return
/// the call instruction (an i32 value).
///
/// The callee is __kmpc_dispatch_next_8 when is64BitArch() holds and
/// __kmpc_dispatch_next_4 otherwise; it is declared in the module if it is
/// not present yet. \p IsLastPtr, \p LBPtr, \p UBPtr and \p StridePtr are
/// passed through as pointer arguments after SourceLocationInfo and
/// \p GlobalThreadID.
Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID,
                                                        Value *IsLastPtr,
                                                        Value *LBPtr,
                                                        Value *UBPtr,
                                                        Value *StridePtr) {
  const std::string NextFnName =
      is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4";

  Function *NextFn = M->getFunction(NextFnName);
  if (NextFn == nullptr) {
    // First use in this module: add an external declaration. The ident_t type
    // is only needed for this declaration.
    StructType *IdentTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {IdentTy->getPointerTo(),
                        Builder.getInt32Ty(),
                        Builder.getInt32Ty()->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType->getPointerTo()};
    FunctionType *NextFnTy =
        FunctionType::get(Builder.getInt32Ty(), ParamTys, false);
    NextFn =
        Function::Create(NextFnTy, Function::ExternalLinkage, NextFnName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr,
                       LBPtr,              UBPtr,          StridePtr};
  CallInst *NextCall = Builder.CreateCall(NextFn, CallArgs);
  NextCall->setDebugLoc(DLGenerated);
  return NextCall;
}
// TODO: This function currently creates a source location dummy. It might be
// necessary to (actually) provide information, in the future.
/// Return the module's dummy source location global ".loc.dummy", creating it
/// (together with its backing string ".str.ident") if the module does not
/// contain it yet.
///
/// The global is a private constant of type struct.ident_t whose four i32
/// fields are 0 and whose last field points to the string
/// "Source location dummy.". struct.ident_t itself is created when the context
/// has no type of that name.
GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() {
  const std::string LocName = ".loc.dummy";

  // The dummy is created with private linkage below, so the lookup must allow
  // local-linkage globals; the default lookup skips them and would never find
  // an existing dummy, emitting a duplicate pair of globals on every call.
  GlobalVariable *SourceLocDummy =
      M->getGlobalVariable(LocName, /* AllowInternal */ true);
  if (SourceLocDummy != nullptr)
    return SourceLocDummy;

  const std::string StructName = "struct.ident_t";
  StructType *IdentTy = StructType::getTypeByName(M->getContext(), StructName);

  // If the ident_t StructType is not available, declare it.
  // in LLVM-IR: ident_t = type { i32, i32, i32, i32, i8* }
  if (!IdentTy) {
    Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(),
                          Builder.getInt32Ty(), Builder.getInt32Ty(),
                          Builder.getInt8PtrTy()};

    IdentTy =
        StructType::create(M->getContext(), LocMembers, StructName, false);
  }

  // 22 characters of text plus the terminating NUL added by getString().
  const auto ArrayType =
      llvm::ArrayType::get(Builder.getInt8Ty(), /* Length */ 23);

  // Global Variable Definitions
  GlobalVariable *StrVar =
      new GlobalVariable(*M, ArrayType, true, GlobalValue::PrivateLinkage,
                         nullptr, ".str.ident");
  StrVar->setAlignment(llvm::Align(1));

  SourceLocDummy = new GlobalVariable(
      *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName);
  SourceLocDummy->setAlignment(llvm::Align(8));

  // Constant Definitions
  Constant *InitStr = ConstantDataArray::getString(
      M->getContext(), "Source location dummy.", true);

  // The cast assumes the builder folds this all-constant GEP into a Constant.
  Constant *StrPtr = static_cast<Constant *>(Builder.CreateInBoundsGEP(
      ArrayType, StrVar, {Builder.getInt32(0), Builder.getInt32(0)}));

  Constant *LocInitStruct = ConstantStruct::get(
      IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0),
                Builder.getInt32(0), StrPtr});

  // Initialize variables
  StrVar->setInitializer(InitStr);
  SourceLocDummy->setInitializer(LocInitStruct);

  return SourceLocDummy;
}
/// Return true iff LongType is a 64-bit wide integer type.
bool ParallelLoopGeneratorKMP::is64BitArch() {
  const unsigned LongBits = LongType->getIntegerBitWidth();
  return LongBits == 64;
}
/// Map the requested schedule to the one that is actually used.
///
/// A StaticChunked request with a \p ChunkSize of 0 becomes StaticNonChunked;
/// every other combination is returned unchanged.
OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType(
    int ChunkSize, OMPGeneralSchedulingType Scheduling) const {
  const bool IsChunklessStatic =
      Scheduling == OMPGeneralSchedulingType::StaticChunked && ChunkSize == 0;
  return IsChunklessStatic ? OMPGeneralSchedulingType::StaticNonChunked
                           : Scheduling;
}