diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index ea35b298c020..17a9a20c9b43 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -38,6 +38,13 @@ class ExtendedValue; class MutableBoxValue; class BoxValue; +/// Get the integer type with a pointer size. +inline mlir::Type getIntPtrType(mlir::OpBuilder &builder) { + // TODO: Delay the need of such type until codegen or find a way to use + // llvm::DataLayout::getPointerSizeInBits here. + return builder.getI64Type(); +} + //===----------------------------------------------------------------------===// // FirOpBuilder //===----------------------------------------------------------------------===// @@ -143,11 +150,7 @@ public: /// Get the integer type whose bit width corresponds to the width of pointer /// types, or is bigger. - mlir::Type getIntPtrType() { - // TODO: Delay the need of such type until codegen or find a way to use - // llvm::DataLayout::getPointerSizeInBits here. - return getI64Type(); - } + mlir::Type getIntPtrType() { return fir::getIntPtrType(*this); } /// Wrap `str` to a SymbolRefAttr. mlir::SymbolRefAttr getSymbolRefAttr(llvm::StringRef str) { @@ -712,6 +715,11 @@ fir::BoxValue createBoxValue(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value createNullBoxProc(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type boxType); +/// Convert a value to a new type. Return the value directly if it has the right +/// type. +mlir::Value createConvert(mlir::OpBuilder &, mlir::Location, mlir::Type, + mlir::Value); + /// Set internal linkage attribute on a function. 
void setInternalLinkage(mlir::func::FuncOp); diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 5b03806614f9..89c13fa7cebe 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -124,6 +124,13 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, Indeed, a user would likely expect a good Fortran compiler to perform such an optimization. + Stack allocations have a maximum lifetime concept: their uses must not + exceed the lifetime of the closest parent operation with the + AutomaticAllocationScope trait, IsIsolatedFromAbove trait, or + LoopLikeOpInterface trait. This restriction is meant to ease the + insertion of stack save and restore operations, and to ease the conversion + of stack allocation into heap allocation. + Until Fortran 2018, procedures defaulted to non-recursive. A legal implementation could therefore convert stack allocations to global allocations. Such a conversion effectively adds the SAVE attribute to all @@ -183,11 +190,17 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, mlir::Type getAllocatedType(); bool hasLenParams() { return !getTypeparams().empty(); } bool hasShapeOperands() { return !getShape().empty(); } + bool isDynamic() {return hasLenParams() || hasShapeOperands();} unsigned numLenParams() { return getTypeparams().size(); } operand_range getLenParams() { return getTypeparams(); } unsigned numShapeOperands() { return getShape().size(); } operand_range getShapeOperands() { return getShape(); } static mlir::Type getRefTy(mlir::Type ty); + /// Is this an operation that owns the alloca directly made in its region? + static bool ownsNestedAlloca(mlir::Operation* op); + /// Get the parent region that owns this alloca. Nullptr if none can be + /// identified. 
+ mlir::Region* getOwnerRegion(); }]; } diff --git a/flang/include/flang/Optimizer/Transforms/MemoryUtils.h b/flang/include/flang/Optimizer/Transforms/MemoryUtils.h new file mode 100644 index 000000000000..92a519cd0c83 --- /dev/null +++ b/flang/include/flang/Optimizer/Transforms/MemoryUtils.h @@ -0,0 +1,62 @@ +//===-- Optimizer/Transforms/MemoryUtils.h ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// +// +// This file defines a utility to replace fir.alloca by dynamic allocation and +// deallocation. The exact kind of dynamic allocation is left to be defined by +// the utility user via callbacks (could be fir.allocmem or custom runtime +// calls). +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_MEMORYUTILS_H +#define FORTRAN_OPTIMIZER_TRANSFORMS_MEMORYUTILS_H + +#include "flang/Optimizer/Dialect/FIROps.h" + +namespace mlir { +class RewriterBase; +} + +namespace fir { + +/// Type of callbacks that indicate if a given fir.alloca must be +/// rewritten. +using MustRewriteCallBack = llvm::function_ref; + +/// Type of callbacks that produce the replacement for a given fir.alloca. +/// It is provided extra information about the dominance of the deallocation +/// points that have been identified, and may refuse to replace the alloca, +/// even if the MustRewriteCallBack previously returned true, in which case +/// it should return a null value. +/// The callback should not delete the alloca, the utility will do it. 
+using AllocaRewriterCallBack = llvm::function_ref; +/// Type of callbacks that must generate deallocation of storage obtained via +/// AllocaRewriterCallBack calls. +using DeallocCallBack = + llvm::function_ref; + +/// Utility to replace fir.alloca by dynamic allocations inside \p parentOp. +/// \p MustRewriteCallBack lets the user control which fir.alloca should be +/// replaced. \p AllocaRewriterCallBack lets the user define how the new memory +/// should be allocated. \p DeallocCallBack lets the user decide how the memory +/// should be deallocated. The boolean result indicates if the utility succeeded +/// to replace all fir.alloca as requested by the user. Causes of failures are +/// the presence of unregistered operations, or OpenMP/ACC recipe operations +/// that return memory allocated inside their region. +bool replaceAllocas(mlir::RewriterBase &rewriter, mlir::Operation *parentOp, + MustRewriteCallBack, AllocaRewriterCallBack, + DeallocCallBack); + +} // namespace fir + +#endif // FORTRAN_OPTIMIZER_TRANSFORMS_MEMORYUTILS_H diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 2ea302d18801..2961df96b3ca 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -455,15 +455,21 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics( return createConvert(loc, toTy, val); } -mlir::Value fir::FirOpBuilder::createConvert(mlir::Location loc, - mlir::Type toTy, mlir::Value val) { +mlir::Value fir::factory::createConvert(mlir::OpBuilder &builder, + mlir::Location loc, mlir::Type toTy, + mlir::Value val) { if (val.getType() != toTy) { assert(!fir::isa_derived(toTy)); - return create(loc, toTy, val); + return builder.create(loc, toTy, val); } return val; } +mlir::Value fir::FirOpBuilder::createConvert(mlir::Location loc, + mlir::Type toTy, mlir::Value val) { + return fir::factory::createConvert(*this, loc, toTy, val); +} + void 
fir::FirOpBuilder::createStoreWithConvert(mlir::Location loc, mlir::Value val, mlir::Value addr) { diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index a499a6e4f8d0..9e6b88041ba6 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -275,6 +275,27 @@ llvm::LogicalResult fir::AllocaOp::verify() { return mlir::success(); } +bool fir::AllocaOp::ownsNestedAlloca(mlir::Operation *op) { + return op->hasTrait() || + op->hasTrait() || + mlir::isa(*op); +} + +mlir::Region *fir::AllocaOp::getOwnerRegion() { + mlir::Operation *currentOp = getOperation(); + while (mlir::Operation *parentOp = currentOp->getParentOp()) { + // If the operation was not registered, inquiries about its traits will be + // incorrect and it is not possible to reason about the operation. This + // should not happen in a normal Fortran compilation flow, but be foolproof. + if (!parentOp->isRegistered()) + return nullptr; + if (fir::AllocaOp::ownsNestedAlloca(parentOp)) + return currentOp->getParentRegion(); + currentOp = parentOp; + } + return nullptr; +} + //===----------------------------------------------------------------------===// // AllocMemOp //===----------------------------------------------------------------------===// diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 94d94398d696..310830424089 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -10,6 +10,7 @@ add_flang_library(FIRTransforms ControlFlowConverter.cpp ArrayValueCopy.cpp ExternalNameConversion.cpp + MemoryUtils.cpp MemoryAllocation.cpp StackArrays.cpp MemRefDataFlowOpt.cpp diff --git a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp index 03b1ae89428a..3f308a8f4b56 100644 --- a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp +++ 
b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp @@ -9,6 +9,7 @@ #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Transforms/MemoryUtils.h" #include "flang/Optimizer/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Diagnostics.h" @@ -27,50 +28,18 @@ namespace fir { // Number of elements in an array does not determine where it is allocated. static constexpr std::size_t unlimitedArraySize = ~static_cast(0); -namespace { -class ReturnAnalysis { -public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReturnAnalysis) - - ReturnAnalysis(mlir::Operation *op) { - if (auto func = mlir::dyn_cast(op)) - for (mlir::Block &block : func) - for (mlir::Operation &i : block) - if (mlir::isa(i)) { - returnMap[op].push_back(&i); - break; - } - } - - llvm::SmallVector getReturns(mlir::Operation *func) const { - auto iter = returnMap.find(func); - if (iter != returnMap.end()) - return iter->second; - return {}; - } - -private: - llvm::DenseMap> - returnMap; -}; -} // namespace - /// Return `true` if this allocation is to remain on the stack (`fir.alloca`). /// Otherwise the allocation should be moved to the heap (`fir.allocmem`). static inline bool -keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry, +keepStackAllocation(fir::AllocaOp alloca, const fir::MemoryAllocationOptOptions &options) { - // Limitation: only arrays allocated on the stack in the entry block are - // considered for now. - // TODO: Generalize the algorithm and placement of the freemem nodes. - if (alloca->getBlock() != entry) - return true; + // Move all arrays and character with runtime determined size to the heap. + if (options.dynamicArrayOnHeap && alloca.isDynamic()) + return false; + // TODO: use data layout to reason in terms of byte size to cover all "big" + // entities, which may be scalar derived types. 
if (auto seqTy = mlir::dyn_cast(alloca.getInType())) { - if (fir::hasDynamicSize(seqTy)) { - // Move all arrays with runtime determined size to the heap. - if (options.dynamicArrayOnHeap) - return false; - } else { + if (!fir::hasDynamicSize(seqTy)) { std::int64_t numberOfElements = 1; for (std::int64_t i : seqTy.getShape()) { numberOfElements *= i; @@ -82,8 +51,6 @@ keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry, // the heap. if (static_cast(numberOfElements) > options.maxStackArraySize) { - LLVM_DEBUG(llvm::dbgs() - << "memory allocation opt: found " << alloca << '\n'); return false; } } @@ -91,49 +58,30 @@ keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry, return true; } -namespace { -class AllocaOpConversion : public mlir::OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; +static mlir::Value genAllocmem(mlir::OpBuilder &builder, fir::AllocaOp alloca, + bool deallocPointsDominateAlloc) { + mlir::Type varTy = alloca.getInType(); + auto unpackName = [](std::optional opt) -> llvm::StringRef { + if (opt) + return *opt; + return {}; + }; + llvm::StringRef uniqName = unpackName(alloca.getUniqName()); + llvm::StringRef bindcName = unpackName(alloca.getBindcName()); + auto heap = builder.create(alloca.getLoc(), varTy, uniqName, + bindcName, alloca.getTypeparams(), + alloca.getShape()); + LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: replaced " << alloca + << " with " << heap << '\n'); + return heap; +} - AllocaOpConversion(mlir::MLIRContext *ctx, - llvm::ArrayRef rets) - : OpRewritePattern(ctx), returnOps(rets) {} - - llvm::LogicalResult - matchAndRewrite(fir::AllocaOp alloca, - mlir::PatternRewriter &rewriter) const override { - auto loc = alloca.getLoc(); - mlir::Type varTy = alloca.getInType(); - auto unpackName = - [](std::optional opt) -> llvm::StringRef { - if (opt) - return *opt; - return {}; - }; - auto uniqName = unpackName(alloca.getUniqName()); - auto bindcName = unpackName(alloca.getBindcName()); - auto 
heap = rewriter.create( - loc, varTy, uniqName, bindcName, alloca.getTypeparams(), - alloca.getShape()); - auto insPt = rewriter.saveInsertionPoint(); - for (mlir::Operation *retOp : returnOps) { - rewriter.setInsertionPoint(retOp); - [[maybe_unused]] auto free = rewriter.create(loc, heap); - LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: add free " << free - << " for " << heap << '\n'); - } - rewriter.restoreInsertionPoint(insPt); - rewriter.replaceOpWithNewOp( - alloca, fir::ReferenceType::get(varTy), heap); - LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: replaced " << alloca - << " with " << heap << '\n'); - return mlir::success(); - } - -private: - llvm::ArrayRef returnOps; -}; +static void genFreemem(mlir::Location loc, mlir::OpBuilder &builder, + mlir::Value allocmem) { + [[maybe_unused]] auto free = builder.create(loc, allocmem); + LLVM_DEBUG(llvm::dbgs() << "memory allocation opt: add free " << free + << " for " << allocmem << '\n'); +} /// This pass can reclassify memory allocations (fir.alloca, fir.allocmem) based /// on heuristics and settings. The intention is to allow better performance and @@ -144,6 +92,7 @@ private: /// make it a heap allocation. /// 2. If a stack allocation is an array with a runtime evaluated size make /// it a heap allocation. +namespace { class MemoryAllocationOpt : public fir::impl::MemoryAllocationOptBase { public: @@ -184,23 +133,17 @@ public: // If func is a declaration, skip it. 
if (func.empty()) return; - - const auto &analysis = getAnalysis(); - - target.addLegalDialect(); - target.addDynamicallyLegalOp([&](fir::AllocaOp alloca) { - return keepStackAllocation(alloca, &func.front(), options); - }); - - llvm::SmallVector returnOps = analysis.getReturns(func); - patterns.insert(context, returnOps); - if (mlir::failed( - mlir::applyPartialConversion(func, target, std::move(patterns)))) { - mlir::emitError(func.getLoc(), - "error in memory allocation optimization\n"); - signalPassFailure(); - } + auto tryReplacing = [&](fir::AllocaOp alloca) { + bool res = !keepStackAllocation(alloca, options); + if (res) { + LLVM_DEBUG(llvm::dbgs() + << "memory allocation opt: found " << alloca << '\n'); + } + return res; + }; + mlir::IRRewriter rewriter(context); + fir::replaceAllocas(rewriter, func.getOperation(), tryReplacing, + genAllocmem, genFreemem); } private: diff --git a/flang/lib/Optimizer/Transforms/MemoryUtils.cpp b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp new file mode 100644 index 000000000000..1f8edf851de9 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp @@ -0,0 +1,287 @@ +//===- MemoryUtils.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Transforms/MemoryUtils.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/Dominance.h" +#include "llvm/ADT/STLExtras.h" + +namespace { +/// Helper class to detect if an alloca is inside an mlir::Block that can be +/// reached again before its deallocation points via block successors. 
This +/// analysis is only valid if the deallocation points are inside (or nested +/// inside) the same region as alloca because it does not consider region CFG +/// (for instance, the block inside a fir.do_loop is obviously inside a loop, +/// but is not a loop formed by blocks). The dominance of the alloca on its +/// deallocation points implies this pre-condition (although it is more +/// restrictive). +class BlockCycleDetector { +public: + bool allocaIsInCycle(fir::AllocaOp alloca, + llvm::ArrayRef deallocationPoints); + +private: + // Cache for blocks owning alloca that have been analyzed. In many Fortran + // programs, allocas are usually made in the same blocks with no block cycles. + // So getting a fast "no" is beneficial. + llvm::DenseMap analyzed; +}; +} // namespace + +namespace { +class AllocaReplaceImpl { +public: + AllocaReplaceImpl(fir::AllocaRewriterCallBack allocaRewriter, + fir::DeallocCallBack deallocGenerator) + : allocaRewriter{allocaRewriter}, deallocGenerator{deallocGenerator} {} + bool replace(mlir::RewriterBase &, fir::AllocaOp); + +private: + mlir::Region *findDeallocationPointsAndOwner( + fir::AllocaOp alloca, + llvm::SmallVectorImpl &deallocationPoints); + bool + allocDominatesDealloc(fir::AllocaOp alloca, + llvm::ArrayRef deallocationPoints) { + return llvm::all_of(deallocationPoints, [&](mlir::Operation *deallocPoint) { + return this->dominanceInfo.properlyDominates(alloca.getOperation(), + deallocPoint); + }); + } + void + genIndirectDeallocation(mlir::RewriterBase &, fir::AllocaOp, + llvm::ArrayRef deallocationPoints, + mlir::Value replacement, mlir::Region &owningRegion); + +private: + fir::AllocaRewriterCallBack allocaRewriter; + fir::DeallocCallBack deallocGenerator; + mlir::DominanceInfo dominanceInfo; + BlockCycleDetector blockCycleDetector; +}; +} // namespace + +static bool +allocaIsInCycleImpl(mlir::Block *allocaBlock, + llvm::ArrayRef deallocationPoints) { + llvm::DenseSet seen; + // Insert the deallocation point blocks as 
"seen" so that the block + // traversal will stop at them. + for (mlir::Operation *deallocPoint : deallocationPoints) + seen.insert(deallocPoint->getBlock()); + if (seen.contains(allocaBlock)) + return false; + // Traverse the block successor graph starting by the alloca block. + llvm::SmallVector successors{allocaBlock}; + while (!successors.empty()) + for (mlir::Block *next : successors.pop_back_val()->getSuccessors()) { + if (next == allocaBlock) + return true; + if (auto pair = seen.insert(next); pair.second) + successors.push_back(next); + } + // The traversal did not reach the alloca block again. + return false; +} +bool BlockCycleDetector::allocaIsInCycle( + fir::AllocaOp alloca, + llvm::ArrayRef deallocationPoints) { + mlir::Block *allocaBlock = alloca->getBlock(); + auto analyzedPair = analyzed.try_emplace(allocaBlock, /*isInCycle=*/false); + bool alreadyAnalyzed = !analyzedPair.second; + bool &isInCycle = analyzedPair.first->second; + // Fast exit if block was already analyzed and no cycle was found. + if (alreadyAnalyzed && !isInCycle) + return false; + // If the analysis was not done generically for this block, run it and + // save the result. + if (!alreadyAnalyzed) + isInCycle = allocaIsInCycleImpl(allocaBlock, /*deallocationPoints*/ {}); + if (!isInCycle) + return false; + // If the generic analysis found a block loop, see if the deallocation + // point would be reached before reaching the block again. Do not + // cache that analysis that is specific to the deallocation points + // found for this alloca. 
+ return allocaIsInCycleImpl(allocaBlock, deallocationPoints); +} + +static bool terminatorYieldsMemory(mlir::Operation &terminator) { + return llvm::any_of(terminator.getResults(), [](mlir::OpResult res) { + return fir::conformsWithPassByRef(res.getType()); + }); +} + +static bool isRegionTerminator(mlir::Operation &terminator) { + // Using ReturnLike trait is tempting but it is not set on + // all region terminators that matter (like omp::TerminatorOp that + // has no results). + // May be true for dead code. It is not a correctness issue and dead code can + // be eliminated by running region simplification before this utility is + // used. + // May also be true for unreachable-like terminators (e.g., after an abort + // call related to Fortran STOP). This is also OK, the inserted deallocation + // will simply never be reached. It is easier for the rest of the code here + // to assume there is always at least one deallocation point, so keep + // unreachable terminators. + return !terminator.hasSuccessors(); +} + +mlir::Region *AllocaReplaceImpl::findDeallocationPointsAndOwner( + fir::AllocaOp alloca, + llvm::SmallVectorImpl &deallocationPoints) { + // Step 1: Identify the operation and region owning the alloca. + mlir::Region *owningRegion = alloca.getOwnerRegion(); + if (!owningRegion) + return nullptr; + mlir::Operation *owningOp = owningRegion->getParentOp(); + assert(owningOp && "region expected to be owned"); + // Step 2: Identify the exit points of the owning region, they are the default + // deallocation points. TODO: detect and use lifetime markers to get earlier + // deallocation points. 
+ bool isOpenACCMPRecipe = mlir::isa(owningOp); + for (mlir::Block &block : owningRegion->getBlocks()) + if (mlir::Operation *terminator = block.getTerminator(); + isRegionTerminator(*terminator)) { + // FIXME: OpenACC and OpenMP privatization recipes are stand-alone + // operations meant to be later "inlined", the value they return may + // be the address of a local alloca. It would be incorrect to insert + // deallocation before the terminator (this would introduce use after + // free once the recipe is inlined). + // This probably requires redesign or special handling on the OpenACC/MP + // side. + if (isOpenACCMPRecipe && terminatorYieldsMemory(*terminator)) + return nullptr; + deallocationPoints.push_back(terminator); + } + // If no block terminators without successors have been found, this is + // an odd region we cannot reason about (never seen yet in FIR and + // mainstream dialects, but MLIR does not really prevent it). + if (deallocationPoints.empty()) + return nullptr; + + // Step 3: detect block based loops between the allocation and deallocation + // points, and add a deallocation point on the back edge to avoid memory + // leaks. + // The detection avoids doing region CFG analysis by assuming that there may + // be cycles if deallocation points are not dominated by the alloca. + // This leaves the cases where the deallocation points are in the same region + // as the alloca (or nested inside it). In which cases there may be a back + // edge between the alloca and the deallocation point via block successors. An + // analysis is run to detect those cases. + // When a loop is detected, the easiest solution to deallocate on the back + // edge is to store the allocated memory address in a variable (that dominates + // the loops) and to deallocate the address in that variable if it is set + // before executing the allocation. This strategy still leads to correct + // execution in the "false positive" cases. 
+ // Hence, the alloca is added as a deallocation point when there is no + // dominance. Note that bringing lifetime markers above will reduce the + // false positives. + if (!allocDominatesDealloc(alloca, deallocationPoints) || + blockCycleDetector.allocaIsInCycle(alloca, deallocationPoints)) + deallocationPoints.push_back(alloca.getOperation()); + return owningRegion; +} + +void AllocaReplaceImpl::genIndirectDeallocation( + mlir::RewriterBase &rewriter, fir::AllocaOp alloca, + llvm::ArrayRef deallocationPoints, + mlir::Value replacement, mlir::Region &owningRegion) { + mlir::Location loc = alloca.getLoc(); + auto replacementInsertPoint = rewriter.saveInsertionPoint(); + // Create C pointer variable in the entry block to store the alloc + // and access it indirectly in the entry points that do not dominate. + rewriter.setInsertionPointToStart(&owningRegion.front()); + mlir::Type heapType = fir::HeapType::get(alloca.getInType()); + mlir::Value ptrVar = rewriter.create(loc, heapType); + mlir::Value nullPtr = rewriter.create(loc, heapType); + rewriter.create(loc, nullPtr, ptrVar); + // TODO: introducing a pointer compare op in FIR would help + // generating less IR here. + mlir::Type intPtrTy = fir::getIntPtrType(rewriter); + mlir::Value c0 = rewriter.create( + loc, intPtrTy, rewriter.getIntegerAttr(intPtrTy, 0)); + + // Store new storage address right after its creation. + rewriter.restoreInsertionPoint(replacementInsertPoint); + mlir::Value castReplacement = + fir::factory::createConvert(rewriter, loc, heapType, replacement); + rewriter.create(loc, castReplacement, ptrVar); + + // Generate conditional deallocation at every deallocation point. 
+ auto genConditionalDealloc = [&](mlir::Location loc) { + mlir::Value ptrVal = rewriter.create(loc, ptrVar); + mlir::Value ptrToInt = + rewriter.create(loc, intPtrTy, ptrVal); + mlir::Value isAllocated = rewriter.create( + loc, mlir::arith::CmpIPredicate::ne, ptrToInt, c0); + auto ifOp = rewriter.create(loc, std::nullopt, isAllocated, + /*withElseRegion=*/false); + rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); + mlir::Value cast = fir::factory::createConvert( + rewriter, loc, replacement.getType(), ptrVal); + deallocGenerator(loc, rewriter, cast); + // Currently there is no need to reset the pointer var because two + // deallocation points can never be reached without going through the + // alloca. + rewriter.setInsertionPointAfter(ifOp); + }; + for (mlir::Operation *deallocPoint : deallocationPoints) { + rewriter.setInsertionPoint(deallocPoint); + genConditionalDealloc(deallocPoint->getLoc()); + } +} + +bool AllocaReplaceImpl::replace(mlir::RewriterBase &rewriter, + fir::AllocaOp alloca) { + llvm::SmallVector deallocationPoints; + mlir::Region *owningRegion = + findDeallocationPointsAndOwner(alloca, deallocationPoints); + if (!owningRegion) + return false; + rewriter.setInsertionPointAfter(alloca.getOperation()); + bool deallocPointsDominateAlloc = + allocDominatesDealloc(alloca, deallocationPoints); + if (mlir::Value replacement = + allocaRewriter(rewriter, alloca, deallocPointsDominateAlloc)) { + mlir::Value castReplacement = fir::factory::createConvert( + rewriter, alloca.getLoc(), alloca.getType(), replacement); + if (deallocPointsDominateAlloc) + for (mlir::Operation *deallocPoint : deallocationPoints) { + rewriter.setInsertionPoint(deallocPoint); + deallocGenerator(deallocPoint->getLoc(), rewriter, replacement); + } + else + genIndirectDeallocation(rewriter, alloca, deallocationPoints, replacement, + *owningRegion); + rewriter.replaceOp(alloca, castReplacement); + } + return true; +} + +bool fir::replaceAllocas(mlir::RewriterBase 
&rewriter, + mlir::Operation *parentOp, + MustRewriteCallBack mustReplace, + AllocaRewriterCallBack allocaRewriter, + DeallocCallBack deallocGenerator) { + // If the parent operation is not an alloca owner, the code below would risk + // modifying IR outside of parentOp. + if (!fir::AllocaOp::ownsNestedAlloca(parentOp)) + return false; + auto insertPoint = rewriter.saveInsertionPoint(); + bool replacedAllRequestedAlloca = true; + AllocaReplaceImpl impl(allocaRewriter, deallocGenerator); + parentOp->walk([&](fir::AllocaOp alloca) { + if (mustReplace(alloca)) + replacedAllRequestedAlloca &= impl.replace(rewriter, alloca); + }); + rewriter.restoreInsertionPoint(insertPoint); + return replacedAllRequestedAlloca; +} diff --git a/flang/test/Fir/memory-allocation-opt-2.fir b/flang/test/Fir/memory-allocation-opt-2.fir new file mode 100644 index 000000000000..2addb6ba8b99 --- /dev/null +++ b/flang/test/Fir/memory-allocation-opt-2.fir @@ -0,0 +1,161 @@ +// Test memory allocation pass for fir.alloca outside of function entry block +// RUN: fir-opt --memory-allocation-opt="dynamic-array-on-heap=true" %s | FileCheck %s + +func.func @test_loop() { + %c1 = arith.constant 1 : index + %c100 = arith.constant 100 : index + fir.do_loop %arg0 = %c1 to %c100 step %c1 { + %1 = fir.alloca !fir.array, %arg0 + fir.call @bar(%1) : (!fir.ref>) -> () + fir.result + } + return +} +// CHECK-LABEL: func.func @test_loop() { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 100 : index +// CHECK: fir.do_loop %[[VAL_2:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_0]] { +// CHECK: %[[VAL_3:.*]] = fir.allocmem !fir.array, %[[VAL_2]] {bindc_name = "", uniq_name = ""} +// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.heap>) -> !fir.ref> +// CHECK: fir.call @bar(%[[VAL_4]]) : (!fir.ref>) -> () +// CHECK: fir.freemem %[[VAL_3]] : !fir.heap> +// CHECK: } +// CHECK: return +// CHECK: } + +func.func @test_unstructured(%n : index) { + %c0 = arith.constant 0 : 
index + %c1 = arith.constant 1 : index + %c100 = arith.constant 100 : index + %0 = fir.alloca index + fir.store %c100 to %0 : !fir.ref + cf.br ^bb1 +^bb1: // 2 preds: ^bb0, ^bb4 + %5 = fir.load %0 : !fir.ref + %6 = arith.cmpi sgt, %5, %c0 : index + cf.cond_br %6, ^bb2, ^bb5 +^bb2: // pred: ^bb1 + %1 = fir.alloca !fir.array, %5 + fir.call @bar(%1) : (!fir.ref>) -> () + %25 = arith.cmpi slt, %5, %n : index + cf.cond_br %25, ^bb3, ^bb4 +^bb3: // pred: ^bb2 + fir.call @abort() : () -> () + fir.unreachable +^bb4: // pred: ^bb2 + %28 = arith.subi %5, %c1 : index + fir.store %28 to %0 : !fir.ref + cf.br ^bb1 +^bb5: // pred: ^bb1 + return +} +// CHECK-LABEL: func.func @test_unstructured( +// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK: %[[VAL_1:.*]] = fir.alloca !fir.heap> +// CHECK: %[[VAL_2:.*]] = fir.zero_bits !fir.heap> +// CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_6:.*]] = arith.constant 100 : index +// CHECK: %[[VAL_7:.*]] = fir.alloca index +// CHECK: fir.store %[[VAL_6]] to %[[VAL_7]] : !fir.ref +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_7]] : !fir.ref +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_4]] : index +// CHECK: cf.cond_br %[[VAL_9]], ^bb2, ^bb5 +// CHECK: ^bb2: +// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_12:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_12]] { +// CHECK: fir.freemem %[[VAL_10]] : !fir.heap> +// CHECK: } +// CHECK: %[[VAL_13:.*]] = fir.allocmem !fir.array, %[[VAL_8]] {bindc_name = "", uniq_name = ""} +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (!fir.heap>) -> !fir.ref> +// CHECK: fir.store %[[VAL_13]] to %[[VAL_1]] : !fir.ref>> +// CHECK: fir.call 
@bar(%[[VAL_14]]) : (!fir.ref>) -> () +// CHECK: %[[VAL_15:.*]] = arith.cmpi slt, %[[VAL_8]], %[[VAL_0]] : index +// CHECK: cf.cond_br %[[VAL_15]], ^bb3, ^bb4 +// CHECK: ^bb3: +// CHECK: fir.call @abort() : () -> () +// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_18]] { +// CHECK: fir.freemem %[[VAL_16]] : !fir.heap> +// CHECK: } +// CHECK: fir.unreachable +// CHECK: ^bb4: +// CHECK: %[[VAL_19:.*]] = arith.subi %[[VAL_8]], %[[VAL_5]] : index +// CHECK: fir.store %[[VAL_19]] to %[[VAL_7]] : !fir.ref +// CHECK: cf.br ^bb1 +// CHECK: ^bb5: +// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_22:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_22]] { +// CHECK: fir.freemem %[[VAL_20]] : !fir.heap> +// CHECK: } +// CHECK: return +// CHECK: } + +func.func @alloca_dominate_return_in_cycle(%arg0: index) { + %0 = fir.alloca index + %c1 = arith.constant 1 : index + fir.store %c1 to %0 : !fir.ref + cf.br ^bb1 +^bb1: // 2 preds: ^bb0, ^bb2 + %1 = fir.load %0 : !fir.ref + %2 = fir.alloca !fir.array, %1 + fir.call @bar(%2) : (!fir.ref>) -> () + %3 = arith.addi %1, %c1 : index + fir.store %3 to %0 : !fir.ref + %4 = arith.cmpi slt, %3, %arg0 : index + cf.cond_br %4, ^bb2, ^bb3 +^bb2: // pred: ^bb1 + cf.br ^bb1 +^bb3: // pred: ^bb1 + return +} +// CHECK-LABEL: func.func @alloca_dominate_return_in_cycle( +// CHECK-SAME: %[[VAL_0:.*]]: index) { +// CHECK: %[[VAL_1:.*]] = fir.alloca !fir.heap> +// CHECK: %[[VAL_2:.*]] = fir.zero_bits !fir.heap> +// CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_4:.*]] = fir.alloca index +// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index +// CHECK: fir.store %[[VAL_5]] 
to %[[VAL_4]] : !fir.ref +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]] : !fir.ref +// CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_9:.*]] = arith.cmpi ne, %[[VAL_8]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_9]] { +// CHECK: fir.freemem %[[VAL_7]] : !fir.heap> +// CHECK: } +// CHECK: %[[VAL_10:.*]] = fir.allocmem !fir.array, %[[VAL_6]] {bindc_name = "", uniq_name = ""} +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.heap>) -> !fir.ref> +// CHECK: fir.store %[[VAL_10]] to %[[VAL_1]] : !fir.ref>> +// CHECK: fir.call @bar(%[[VAL_11]]) : (!fir.ref>) -> () +// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_6]], %[[VAL_5]] : index +// CHECK: fir.store %[[VAL_12]] to %[[VAL_4]] : !fir.ref +// CHECK: %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_0]] : index +// CHECK: cf.cond_br %[[VAL_13]], ^bb2, ^bb3 +// CHECK: ^bb2: +// CHECK: cf.br ^bb1 +// CHECK: ^bb3: +// CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_1]] : !fir.ref>> +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (!fir.heap>) -> i64 +// CHECK: %[[VAL_16:.*]] = arith.cmpi ne, %[[VAL_15]], %[[VAL_3]] : i64 +// CHECK: fir.if %[[VAL_16]] { +// CHECK: fir.freemem %[[VAL_14]] : !fir.heap> +// CHECK: } +// CHECK: return +// CHECK: } + +func.func private @bar(!fir.ref>) +func.func private @abort()