//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/CodeGen/Target.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/SmallVector.h"

namespace fir {
#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace Fortran::runtime::cuda;

namespace {

// An assumed-size shared array is encoded with a single extent of -1.
static bool isAssumedSize(mlir::ValueRange shape) {
  if (shape.size() != 1)
    return false;
  std::optional<int64_t> val = fir::getIntIfConstant(shape[0]);
  if (val && *val == -1)
    return true;
  return false;
}

struct CUFComputeSharedMemoryOffsetsAndSize
    : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
          CUFComputeSharedMemoryOffsetsAndSize> {

  void runOnOperation() override {
    mlir::ModuleOp mod = getOperation();
    mlir::SymbolTable symTab(mod);
    mlir::OpBuilder opBuilder{mod.getBodyRegion()};
    fir::FirOpBuilder builder(opBuilder, mod);
    fir::KindMapping kindMap{fir::getKindMapping(mod)};
    std::optional<mlir::DataLayout> dl =
        fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
    if (!dl) {
      mlir::emitError(mod.getLoc(),
                      "data layout attribute is required to perform " +
                          getName() + " pass");
      // Bail out: the data layout is dereferenced below.
      return signalPassFailure();
    }

    auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
    mlir::Type i8Ty = builder.getI8Type();
    mlir::Type i32Ty = builder.getI32Type();
    mlir::Type idxTy = builder.getIndexType();
    for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
      unsigned nbDynamicSharedVariables = 0;
      unsigned nbStaticSharedVariables = 0;
      uint64_t sharedMemSize = 0;
      unsigned short alignment = 0;
      mlir::Value crtDynOffset;
      // Go over each shared memory operation and compute its start offset and
      // the size and alignment of the global to be generated if all variables
      // are static. If this is dynamic shared memory, then only the alignment
      // is computed.
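      // A worked example of the static layout below (illustrative values,
      // not taken from a real kernel): a 4-byte i32 followed by an 8-byte
      // f64 get offsets 0 and llvm::alignTo(4, 8) = 8, a 16-byte global,
      // and an overall alignment of 8.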
      for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
        mlir::Location loc = sharedOp.getLoc();
        builder.setInsertionPoint(sharedOp);
        if (fir::hasDynamicSize(sharedOp.getInType())) {
          mlir::Type ty = sharedOp.getInType();
          // Size and alignment are computed on the element type; the dynamic
          // extents are multiplied in below.
          if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
            ty = seqTy.getEleTy();
          unsigned short align = dl->getTypeABIAlignment(ty);
          alignment = std::max(alignment, align);
          uint64_t tySize = dl->getTypeSize(ty);
          ++nbDynamicSharedVariables;
          if (isAssumedSize(sharedOp.getShape()) || !crtDynOffset) {
            mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
            sharedOp.getOffsetMutable().assign(zero);
          } else {
            sharedOp.getOffsetMutable().assign(
                builder.createConvert(loc, i32Ty, crtDynOffset));
          }

          mlir::Value dynSize =
              builder.createIntegerConstant(loc, idxTy, tySize);
          for (auto extent : sharedOp.getShape())
            dynSize =
                mlir::arith::MulIOp::create(builder, loc, dynSize, extent);
          if (crtDynOffset)
            crtDynOffset = mlir::arith::AddIOp::create(builder, loc,
                                                       crtDynOffset, dynSize);
          else
            crtDynOffset = dynSize;

          continue;
        }
        auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
            sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
        ++nbStaticSharedVariables;
        mlir::Value offset = builder.createIntegerConstant(
            loc, i32Ty, llvm::alignTo(sharedMemSize, align));
        sharedOp.getOffsetMutable().assign(offset);
        sharedMemSize =
            llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
        alignment = std::max(alignment, align);
      }

      if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0)
        continue;

      if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0)
        mlir::emitError(
            funcOp.getLoc(),
            "static and dynamic shared variables in a single kernel");

      mlir::DenseElementsAttr init = {};
      if (sharedMemSize > 0) {
        auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
        mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
        init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
      }

      // Create the shared memory global that each shared variable will point
      // into.
      auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
      std::string sharedMemGlobalName =
          (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
      mlir::StringAttr linkage = builder.createInternalLinkage();
      builder.setInsertionPointToEnd(gpuMod.getBody());
      llvm::SmallVector<mlir::NamedAttribute> attrs;
      auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
                                              gpuMod.getContext());
      attrs.push_back(mlir::NamedAttribute(
          fir::GlobalOp::getDataAttrAttrName(globalOpName),
          cuf::DataAttributeAttr::get(gpuMod.getContext(),
                                      cuf::DataAttribute::Shared)));
      auto sharedMem = fir::GlobalOp::create(
          builder, funcOp.getLoc(), sharedMemGlobalName, false, false,
          sharedMemType, init, linkage, attrs);
      sharedMem.setAlignment(alignment);
    }
  }
};

} // end anonymous namespace
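// A worked example of the dynamic case (hand-written illustration, not
// generated output): for two dynamic shared arrays of 4-byte elements with
// extent %n, the first gets offset 0 and the second gets offset 4 * %n,
// accumulated through crtDynOffset; both overlay the dynamic shared memory
// block whose size is supplied at kernel launch.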