//===- ACCImplicitDeclare.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass applies implicit `acc declare` actions to global variables
// referenced in OpenACC compute regions and routine functions.
//
// Overview:
// ---------
// Global references in acc regions (for globals not marked with `acc
// declare` by the user) can be handled in one of two ways:
// - Mapped through data clauses
// - Implicitly marked as `acc declare` (this pass)
//
// Thus, the OpenACC specification focuses solely on implicit data mapping rules
// whose implementation is captured in the `ACCImplicitData` pass.
//
// However, it is both advantageous and required for certain cases to
// use implicit `acc declare` instead:
// - Any functions that are implicitly marked as `acc routine` through
//   `ACCImplicitRoutine` may reference globals. Since data mapping
//   is only possible for compute regions, such globals can only be
//   made available on device through `acc declare`.
// - The compiler can generate and use globals for cases needed in IR
//   representation such as type descriptors or various names needed for
//   runtime calls and error reporting - such cases often are introduced
//   after frontend semantic checking is done since they are related to
//   implementation detail. Thus, such compiler generated globals would
//   not have been visible for a user to mark with `acc declare`.
// - Constant globals such as filename strings or data initialization values
//   are values that do not get mutated but are still needed for appropriate
//   runtime execution. If a kernel is launched 1000 times, it is not a
//   good idea to map such a global 1000 times. Therefore, such globals
//   benefit from being marked with `acc declare`.
//
// This pass automatically marks global variables with the `acc.declare`
// attribute when they are referenced in OpenACC compute constructs or routine
// functions and meet the criteria noted above, ensuring they are properly
// handled for device execution.
//
// The pass performs two main optimizations:
//
// 1. Hoisting: For non-constant globals referenced in compute regions, the
//    pass hoists the address-of operation out of the region when possible,
//    allowing them to be implicitly mapped through normal data clause
//    mechanisms rather than requiring declare marking.
//
// 2. Declaration: For globals that must be available on the device (constants,
//    globals in routines, globals in recipe operations), the pass adds the
//    `acc.declare` attribute with the copyin data clause.
//
// Requirements:
// -------------
// To use this pass in a pipeline, the following requirements must be met:
//
// 1. Operation Interface Implementation: Operations that compute addresses
//    of global variables must implement the `acc::AddressOfGlobalOpInterface`
//    and those that represent globals must implement the
//    `acc::GlobalVariableOpInterface`. Additionally, any operations that
//    indirectly access globals must implement the
//    `acc::IndirectGlobalAccessOpInterface`.
//
// 2. Analysis Registration (Optional): If custom behavior is needed for
//    determining if a symbol use is valid within GPU regions, the dialect
//    should pre-register the `acc::OpenACCSupport` analysis.
//
// Examples:
// ---------
//
// Example 1: Non-constant global in compute region (hoisted)
//
// Before:
//   memref.global @g_scalar : memref<f32> = dense<0.0>
//   func.func @test() {
//     acc.serial {
//       %addr = memref.get_global @g_scalar : memref<f32>
//       %val = memref.load %addr[] : memref<f32>
//       acc.yield
//     }
//   }
//
// After:
//   memref.global @g_scalar : memref<f32> = dense<0.0>
//   func.func @test() {
//     %addr = memref.get_global @g_scalar : memref<f32>
//     acc.serial {
//       %val = memref.load %addr[] : memref<f32>
//       acc.yield
//     }
//   }
//
// Example 2: Constant global in compute region (declared)
//
// Before:
//   memref.global constant @g_const : memref<f32> = dense<1.0>
//   func.func @test() {
//     acc.serial {
//       %addr = memref.get_global @g_const : memref<f32>
//       %val = memref.load %addr[] : memref<f32>
//       acc.yield
//     }
//   }
//
// After:
//   memref.global constant @g_const : memref<f32> = dense<1.0>
//       {acc.declare = #acc.declare<dataClause = acc_copyin>}
//   func.func @test() {
//     acc.serial {
//       %addr = memref.get_global @g_const : memref<f32>
//       %val = memref.load %addr[] : memref<f32>
//       acc.yield
//     }
//   }
//
// Example 3: Global in acc routine (declared)
//
// Before:
//   memref.global @g_data : memref<f32> = dense<0.0>
//   acc.routine @routine_0 func(@device_func)
//   func.func @device_func() attributes {acc.routine_info = ...} {
//     %addr = memref.get_global @g_data : memref<f32>
//     %val = memref.load %addr[] : memref<f32>
//   }
//
// After:
//   memref.global @g_data : memref<f32> = dense<0.0>
//       {acc.declare = #acc.declare<dataClause = acc_copyin>}
//   acc.routine @routine_0 func(@device_func)
//   func.func @device_func() attributes {acc.routine_info = ...} {
//     %addr = memref.get_global @g_data : memref<f32>
//     %val = memref.load %addr[] : memref<f32>
//   }
//
// Example 4: Global in private recipe (declared if recipe is used)
//
// Before:
//   memref.global @g_init : memref<f32> = dense<0.0>
//   acc.private.recipe @priv_recipe : memref<f32> init {
//   ^bb0(%arg0: memref<f32>):
//     %alloc = memref.alloc() : memref<f32>
//     %global = memref.get_global @g_init : memref<f32>
//     %val = memref.load %global[] : memref<f32>
//     memref.store %val, %alloc[] : memref<f32>
//     acc.yield %alloc : memref<f32>
//   } destroy { ... }
//   func.func @test() {
//     %var = memref.alloc() : memref<f32>
//     %priv = acc.private varPtr(%var : memref<f32>)
//               recipe(@priv_recipe) -> memref<f32>
//     acc.parallel private(%priv : memref<f32>) { ... }
//   }
//
// After:
//   memref.global @g_init : memref<f32> = dense<0.0>
//       {acc.declare = #acc.declare<dataClause = acc_copyin>}
//   acc.private.recipe @priv_recipe : memref<f32> init {
//   ^bb0(%arg0: memref<f32>):
//     %alloc = memref.alloc() : memref<f32>
//     %global = memref.get_global @g_init : memref<f32>
//     %val = memref.load %global[] : memref<f32>
//     memref.store %val, %alloc[] : memref<f32>
//     acc.yield %alloc : memref<f32>
//   } destroy { ... }
//   func.func @test() {
//     %var = memref.alloc() : memref<f32>
//     %priv = acc.private varPtr(%var : memref<f32>)
//               recipe(@priv_recipe) -> memref<f32>
//     acc.parallel private(%priv : memref<f32>) { ... }
//   }
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/OpenACC/Transforms/Passes.h"

#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"

namespace mlir {
namespace acc {
#define GEN_PASS_DEF_ACCIMPLICITDECLARE
#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
} // namespace acc
} // namespace mlir

#define DEBUG_TYPE "acc-implicit-declare"

using namespace mlir;

namespace {

// Insertion-ordered set of the global operations selected for `acc declare`.
// NOTE(review): the template arguments were missing from this alias; the
// element type follows from how the set is filled (`Operation *`), but the
// inline capacity of 16 is a placeholder - confirm against the upstream file.
using GlobalOpSetT = llvm::SmallSetVector<Operation *, 16>;

/// Checks whether a use of the requested `globalOp` should be considered
/// for hoisting out of an acc region, to avoid `acc declare`ing something
/// that should instead be implicitly mapped.
static bool isGlobalUseCandidateForHoisting(Operation *globalOp, Operation *user, SymbolRefAttr symbol, acc::OpenACCSupport &accSupport) { // This symbol is valid in GPU region. This means semantics // would change if moved to host - therefore it is not a candidate. if (accSupport.isValidSymbolUse(user, symbol)) return false; bool isConstant = false; bool isFunction = false; if (auto globalVarOp = dyn_cast(globalOp)) isConstant = globalVarOp.isConstant(); if (isa(globalOp)) isFunction = true; // Constants should be kept in device code to ensure they are duplicated. // Function references should be kept in device code to ensure their device // addresses are computed. Everything else should be hoisted since we already // proved they are not valid symbols in GPU region. return !isConstant && !isFunction; } /// Checks whether it is valid to use acc.declare marking on the global. bool isValidForAccDeclare(Operation *globalOp) { // For functions - we use acc.routine marking instead. return !isa(globalOp); } /// Checks whether a recipe operation has meaningful use of its symbol that /// justifies processing its regions for global references. Returns false if: /// 1. The recipe has no symbol uses at all, or /// 2. The only symbol use is the recipe's own symbol definition template static bool hasRelevantRecipeUse(RecipeOpT &recipeOp, ModuleOp &mod) { std::optional symbolUses = recipeOp.getSymbolUses(mod); // No recipe symbol uses. if (!symbolUses.has_value() || symbolUses->empty()) return false; // If more than one use, assume it's used. auto begin = symbolUses->begin(); auto end = symbolUses->end(); if (begin != end && std::next(begin) != end) return true; // If single use, check if the use is the recipe itself. const SymbolTable::SymbolUse &use = *symbolUses->begin(); return use.getUser() != recipeOp.getOperation(); } // Hoists addr_of operations for non-constant globals out of OpenACC regions. 
// This way - they are implicitly mapped instead of being considered for // implicit declare. template static void hoistNonConstantDirectUses(AccConstructT accOp, acc::OpenACCSupport &accSupport) { accOp.walk([&](acc::AddressOfGlobalOpInterface addrOfOp) { SymbolRefAttr symRef = addrOfOp.getSymbol(); if (symRef) { Operation *globalOp = SymbolTable::lookupNearestSymbolFrom(addrOfOp, symRef); if (isGlobalUseCandidateForHoisting(globalOp, addrOfOp, symRef, accSupport)) { addrOfOp->moveBefore(accOp); LLVM_DEBUG( llvm::dbgs() << "Hoisted:\n\t" << addrOfOp << "\n\tfrom:\n\t"; accOp->print(llvm::dbgs(), OpPrintingFlags{}.skipRegions().enableDebugInfo()); llvm::dbgs() << "\n"); } } }); } // Collects the globals referenced in a device region static void collectGlobalsFromDeviceRegion(Region ®ion, GlobalOpSetT &globals, acc::OpenACCSupport &accSupport, SymbolTable &symTab) { region.walk([&](Operation *op) { // 1) Only consider relevant operations which use symbols auto addrOfOp = dyn_cast(op); if (addrOfOp) { SymbolRefAttr symRef = addrOfOp.getSymbol(); // 2) Found an operation which uses the symbol. Next determine if it // is a candidate for `acc declare`. Some of the criteria considered // is whether this symbol is not already a device one (either because // acc declare is already used or this is a CUF global). Operation *globalOp = nullptr; bool isCandidate = !accSupport.isValidSymbolUse(op, symRef, &globalOp); // 3) Add the candidate to the set of globals to be `acc declare`d. 
if (isCandidate && globalOp && isValidForAccDeclare(globalOp)) globals.insert(globalOp); } else if (auto indirectAccessOp = dyn_cast(op)) { // Process operations that indirectly access globals llvm::SmallVector symbols; indirectAccessOp.getReferencedSymbols(symbols, &symTab); for (SymbolRefAttr symRef : symbols) if (Operation *globalOp = symTab.lookup(symRef.getLeafReference())) if (isValidForAccDeclare(globalOp)) globals.insert(globalOp); } }); } // Adds the declare attribute to the operation `op`. static void addDeclareAttr(MLIRContext *context, Operation *op, acc::DataClause clause) { op->setAttr(acc::getDeclareAttrName(), acc::DeclareAttr::get(context, acc::DataClauseAttr::get(context, clause))); } // This pass applies implicit declare actions for globals referenced in // OpenACC compute and routine regions. class ACCImplicitDeclare : public acc::impl::ACCImplicitDeclareBase { public: using ACCImplicitDeclareBase::ACCImplicitDeclareBase; void runOnOperation() override { ModuleOp mod = getOperation(); MLIRContext *context = &getContext(); acc::OpenACCSupport &accSupport = getAnalysis(); // 1) Start off by hoisting any AddressOf operations out of acc region // for any cases we do not want to `acc declare`. This is because we can // rely on implicit data mapping in majority of cases without uselessly // polluting the device globals. mod.walk([&](Operation *op) { TypeSwitch(op) .Case( [&](auto accOp) { hoistNonConstantDirectUses(accOp, accSupport); }); }); // 2) Collect global symbols which need to be `acc declare`d. Do it for // compute regions, acc routine, and existing globals with the declare // attribute. 
SymbolTable symTab(mod); GlobalOpSetT globalsToAccDeclare; mod.walk([&](Operation *op) { TypeSwitch(op) .Case( [&](auto accOp) { collectGlobalsFromDeviceRegion( accOp.getRegion(), globalsToAccDeclare, accSupport, symTab); }) .Case([&](FunctionOpInterface func) { if ((acc::isAccRoutine(func) || acc::isSpecializedAccRoutine(func)) && !func.isExternal()) collectGlobalsFromDeviceRegion(func.getFunctionBody(), globalsToAccDeclare, accSupport, symTab); }) .Case([&](acc::GlobalVariableOpInterface globalVarOp) { if (globalVarOp->getAttr(acc::getDeclareAttrName())) if (Region *initRegion = globalVarOp.getInitRegion()) collectGlobalsFromDeviceRegion(*initRegion, globalsToAccDeclare, accSupport, symTab); }) .Case([&](acc::PrivateRecipeOp privateRecipe) { if (hasRelevantRecipeUse(privateRecipe, mod)) { collectGlobalsFromDeviceRegion(privateRecipe.getInitRegion(), globalsToAccDeclare, accSupport, symTab); collectGlobalsFromDeviceRegion(privateRecipe.getDestroyRegion(), globalsToAccDeclare, accSupport, symTab); } }) .Case([&](acc::FirstprivateRecipeOp firstprivateRecipe) { if (hasRelevantRecipeUse(firstprivateRecipe, mod)) { collectGlobalsFromDeviceRegion(firstprivateRecipe.getInitRegion(), globalsToAccDeclare, accSupport, symTab); collectGlobalsFromDeviceRegion( firstprivateRecipe.getDestroyRegion(), globalsToAccDeclare, accSupport, symTab); collectGlobalsFromDeviceRegion(firstprivateRecipe.getCopyRegion(), globalsToAccDeclare, accSupport, symTab); } }) .Case([&](acc::ReductionRecipeOp reductionRecipe) { if (hasRelevantRecipeUse(reductionRecipe, mod)) { collectGlobalsFromDeviceRegion(reductionRecipe.getInitRegion(), globalsToAccDeclare, accSupport, symTab); collectGlobalsFromDeviceRegion( reductionRecipe.getCombinerRegion(), globalsToAccDeclare, accSupport, symTab); } }); }); // 3) Finally, generate the appropriate declare actions needed to ensure // this is considered for device global. 
for (Operation *globalOp : globalsToAccDeclare) { LLVM_DEBUG( llvm::dbgs() << "Global is being `acc declare copyin`d: "; globalOp->print(llvm::dbgs(), OpPrintingFlags{}.skipRegions().enableDebugInfo()); llvm::dbgs() << "\n"); // Mark it as declare copyin. addDeclareAttr(context, globalOp, acc::DataClause::acc_copyin); // TODO: May need to create the global constructor which does the mapping // action. It is not yet clear if this is needed yet (since the globals // might just end up in the GPU image without requiring mapping via // runtime). } } }; } // namespace