
When multiple assumed-size variables are used in a kernel with dynamic shared memory, each variable uses offset 0. Update the pass to account for that.
```
attributes(global) subroutine testany( a )
  real(4), shared :: smasks(*)
  real(8), shared :: dmasks(*)
end subroutine
```
164 lines
6.6 KiB
C++
164 lines
6.6 KiB
C++
//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "flang/Optimizer/Builder/BoxValue.h"
|
|
#include "flang/Optimizer/Builder/CUFCommon.h"
|
|
#include "flang/Optimizer/Builder/FIRBuilder.h"
|
|
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
|
|
#include "flang/Optimizer/Builder/Todo.h"
|
|
#include "flang/Optimizer/CodeGen/Target.h"
|
|
#include "flang/Optimizer/CodeGen/TypeConverter.h"
|
|
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
|
|
#include "flang/Optimizer/Dialect/FIRAttr.h"
|
|
#include "flang/Optimizer/Dialect/FIRDialect.h"
|
|
#include "flang/Optimizer/Dialect/FIROps.h"
|
|
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
|
|
#include "flang/Optimizer/Dialect/FIRType.h"
|
|
#include "flang/Optimizer/Support/DataLayout.h"
|
|
#include "flang/Runtime/CUDA/registration.h"
|
|
#include "flang/Runtime/entry-names.h"
|
|
#include "mlir/Dialect/DLTI/DLTI.h"
|
|
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
|
|
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
|
|
#include "mlir/IR/Value.h"
|
|
#include "mlir/Pass/Pass.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
|
|
namespace fir {
|
|
#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
|
|
#include "flang/Optimizer/Transforms/Passes.h.inc"
|
|
} // namespace fir
|
|
|
|
using namespace Fortran::runtime::cuda;
|
|
|
|
namespace {
|
|
|
|
/// Return true when \p shape consists of exactly one extent and that extent
/// is a compile-time constant equal to -1, which this pass treats as the
/// marker of an assumed-size variable. Any other rank, a non-constant extent,
/// or a different constant yields false.
static bool isAssumedSize(mlir::ValueRange shape) {
  if (shape.size() != 1)
    return false;
  const std::optional<std::int64_t> extent = fir::getIntIfConstant(shape[0]);
  return extent.has_value() && *extent == -1;
}
|
|
|
|
struct CUFComputeSharedMemoryOffsetsAndSize
|
|
: public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
|
|
CUFComputeSharedMemoryOffsetsAndSize> {
|
|
|
|
void runOnOperation() override {
|
|
mlir::ModuleOp mod = getOperation();
|
|
mlir::SymbolTable symTab(mod);
|
|
mlir::OpBuilder opBuilder{mod.getBodyRegion()};
|
|
fir::FirOpBuilder builder(opBuilder, mod);
|
|
fir::KindMapping kindMap{fir::getKindMapping(mod)};
|
|
std::optional<mlir::DataLayout> dl =
|
|
fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
|
|
if (!dl) {
|
|
mlir::emitError(mod.getLoc(),
|
|
"data layout attribute is required to perform " +
|
|
getName() + "pass");
|
|
}
|
|
|
|
auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
|
|
mlir::Type i8Ty = builder.getI8Type();
|
|
mlir::Type i32Ty = builder.getI32Type();
|
|
mlir::Type idxTy = builder.getIndexType();
|
|
for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
|
|
unsigned nbDynamicSharedVariables = 0;
|
|
unsigned nbStaticSharedVariables = 0;
|
|
uint64_t sharedMemSize = 0;
|
|
unsigned short alignment = 0;
|
|
mlir::Value crtDynOffset;
|
|
|
|
// Go over each shared memory operation and compute their start offset and
|
|
// the size and alignment of the global to be generated if all variables
|
|
// are static. If this is dynamic shared memory, then only the alignment
|
|
// is computed.
|
|
for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
|
|
mlir::Location loc = sharedOp.getLoc();
|
|
builder.setInsertionPoint(sharedOp);
|
|
if (fir::hasDynamicSize(sharedOp.getInType())) {
|
|
mlir::Type ty = sharedOp.getInType();
|
|
if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
|
|
ty = seqTy.getEleTy();
|
|
unsigned short align = dl->getTypeABIAlignment(ty);
|
|
alignment = std::max(alignment, align);
|
|
uint64_t tySize = dl->getTypeSize(ty);
|
|
++nbDynamicSharedVariables;
|
|
if (isAssumedSize(sharedOp.getShape()) || !crtDynOffset) {
|
|
mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
|
|
sharedOp.getOffsetMutable().assign(zero);
|
|
} else {
|
|
sharedOp.getOffsetMutable().assign(
|
|
builder.createConvert(loc, i32Ty, crtDynOffset));
|
|
}
|
|
|
|
mlir::Value dynSize =
|
|
builder.createIntegerConstant(loc, idxTy, tySize);
|
|
for (auto extent : sharedOp.getShape())
|
|
dynSize =
|
|
mlir::arith::MulIOp::create(builder, loc, dynSize, extent);
|
|
if (crtDynOffset)
|
|
crtDynOffset = mlir::arith::AddIOp::create(builder, loc,
|
|
crtDynOffset, dynSize);
|
|
else
|
|
crtDynOffset = dynSize;
|
|
|
|
continue;
|
|
}
|
|
auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
|
|
sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
|
|
++nbStaticSharedVariables;
|
|
mlir::Value offset = builder.createIntegerConstant(
|
|
loc, i32Ty, llvm::alignTo(sharedMemSize, align));
|
|
sharedOp.getOffsetMutable().assign(offset);
|
|
sharedMemSize =
|
|
llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
|
|
alignment = std::max(alignment, align);
|
|
}
|
|
|
|
if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0)
|
|
continue;
|
|
|
|
if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0)
|
|
mlir::emitError(
|
|
funcOp.getLoc(),
|
|
"static and dynamic shared variables in a single kernel");
|
|
|
|
mlir::DenseElementsAttr init = {};
|
|
if (sharedMemSize > 0) {
|
|
auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
|
|
mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
|
|
init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
|
|
}
|
|
|
|
// Create the shared memory global where each shared variable will point
|
|
// to.
|
|
auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
|
|
std::string sharedMemGlobalName =
|
|
(funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
|
|
mlir::StringAttr linkage = builder.createInternalLinkage();
|
|
builder.setInsertionPointToEnd(gpuMod.getBody());
|
|
llvm::SmallVector<mlir::NamedAttribute> attrs;
|
|
auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
|
|
gpuMod.getContext());
|
|
attrs.push_back(mlir::NamedAttribute(
|
|
fir::GlobalOp::getDataAttrAttrName(globalOpName),
|
|
cuf::DataAttributeAttr::get(gpuMod.getContext(),
|
|
cuf::DataAttribute::Shared)));
|
|
auto sharedMem = fir::GlobalOp::create(
|
|
builder, funcOp.getLoc(), sharedMemGlobalName, false, false,
|
|
sharedMemType, init, linkage, attrs);
|
|
sharedMem.setAlignment(alignment);
|
|
}
|
|
}
|
|
};
|
|
|
|
} // end anonymous namespace
|