When acc.par_width was introduced in https://github.com/llvm/llvm-project/pull/184864, there was a discussion on whether its result should use the index type or a newly created type. It was decided to create a new type, but this means that launch arguments cannot be used directly inside the region (for example as loop bounds) without a conversion from the new type to index. To avoid these casts (and the introduction of an actual operation to perform them), simply restore acc.par_width to produce a result of index type. This allows its result to be used directly in acc.compute_region.
798 lines
27 KiB
C++
798 lines
27 KiB
C++
//===- OpenACCCG.cpp - OpenACC codegen ops, attributes, and types ---------===//
|
|
//
|
|
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Implementation for OpenACC codegen operations, attributes, and types.
|
|
// These correspond to the definitions in OpenACCCG*.td tablegen files
|
|
// and are kept in a separate file because they do not represent direct mappings
|
|
// of OpenACC language constructs; they are intermediate representations used
|
|
// when decomposing and lowering primary `acc` dialect operations.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
|
|
#include "mlir/Dialect/OpenACC/OpenACC.h"
|
|
#include "mlir/Dialect/Utils/StaticValueUtils.h"
|
|
#include "mlir/IR/BuiltinAttributes.h"
|
|
#include "mlir/IR/BuiltinTypes.h"
|
|
#include "mlir/IR/Region.h"
|
|
#include "mlir/Interfaces/ControlFlowInterfaces.h"
|
|
#include "mlir/Support/LogicalResult.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
|
|
using namespace mlir;
|
|
using namespace acc;
|
|
|
|
namespace {
|
|
|
|
/// Generic helper for single-region OpenACC ops that execute their body once
|
|
/// and then return to the parent operation with their results (if any).
|
|
static void
|
|
getSingleRegionOpSuccessorRegions(Operation *op, Region ®ion,
|
|
RegionBranchPoint point,
|
|
SmallVectorImpl<RegionSuccessor> ®ions) {
|
|
if (point.isParent()) {
|
|
regions.push_back(RegionSuccessor(®ion));
|
|
return;
|
|
}
|
|
regions.push_back(RegionSuccessor::parent());
|
|
}
|
|
|
|
/// Returns the values received by `successor`: the results of `op` when the
/// successor is the parent operation, and an empty range otherwise.
static ValueRange getSingleRegionSuccessorInputs(Operation *op,
                                                 RegionSuccessor successor) {
  if (!successor.isParent())
    return ValueRange();
  return ValueRange(op->getResults());
}
|
|
|
|
/// Remove empty acc.kernel_environment operations. If the operation has wait
|
|
/// operands, create a acc.wait operation to preserve synchronization.
|
|
struct RemoveEmptyKernelEnvironment
|
|
: public OpRewritePattern<acc::KernelEnvironmentOp> {
|
|
using OpRewritePattern<acc::KernelEnvironmentOp>::OpRewritePattern;
|
|
|
|
LogicalResult matchAndRewrite(acc::KernelEnvironmentOp op,
|
|
PatternRewriter &rewriter) const override {
|
|
assert(op->getNumRegions() == 1 && "expected op to have one region");
|
|
|
|
Block &block = op.getRegion().front();
|
|
if (!block.empty())
|
|
return failure();
|
|
|
|
// Conservatively disable canonicalization of empty acc.kernel_environment
|
|
// operations if the wait operands in the kernel_environment cannot be fully
|
|
// represented by acc.wait operation.
|
|
|
|
// Disable canonicalization if device type is not the default
|
|
if (auto deviceTypeAttr = op.getWaitOperandsDeviceTypeAttr()) {
|
|
for (auto attr : deviceTypeAttr) {
|
|
if (auto dtAttr = mlir::dyn_cast<acc::DeviceTypeAttr>(attr)) {
|
|
if (dtAttr.getValue() != mlir::acc::DeviceType::None)
|
|
return failure();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Disable canonicalization if any wait segment has a devnum
|
|
if (auto hasDevnumAttr = op.getHasWaitDevnumAttr()) {
|
|
for (auto attr : hasDevnumAttr) {
|
|
if (auto boolAttr = mlir::dyn_cast<mlir::BoolAttr>(attr)) {
|
|
if (boolAttr.getValue())
|
|
return failure();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Disable canonicalization if there are multiple wait segments
|
|
if (auto segmentsAttr = op.getWaitOperandsSegmentsAttr()) {
|
|
if (segmentsAttr.size() > 1)
|
|
return failure();
|
|
}
|
|
|
|
// Remove empty kernel environment.
|
|
// Preserve synchronization by creating acc.wait operation if needed.
|
|
if (!op.getWaitOperands().empty() || op.getWaitOnlyAttr())
|
|
rewriter.replaceOpWithNewOp<acc::WaitOp>(op, op.getWaitOperands(),
|
|
/*asyncOperand=*/Value(),
|
|
/*waitDevnum=*/Value(),
|
|
/*async=*/nullptr,
|
|
/*ifCond=*/Value());
|
|
else
|
|
rewriter.eraseOp(op);
|
|
|
|
return success();
|
|
}
|
|
};
|
|
|
|
template <typename EffectTy>
|
|
static void addOperandEffect(
|
|
SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
|
|
&effects,
|
|
const MutableOperandRange &operand) {
|
|
for (unsigned i = 0, e = operand.size(); i < e; ++i)
|
|
effects.emplace_back(EffectTy::get(), &operand[i]);
|
|
}
|
|
|
|
template <typename EffectTy>
|
|
static void addResultEffect(
|
|
SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
|
|
&effects,
|
|
Value result) {
|
|
effects.emplace_back(EffectTy::get(), mlir::cast<mlir::OpResult>(result));
|
|
}
|
|
|
|
/// Maps a `gpu::Processor` to the integer stored in a `GPUParallelDimAttr`:
/// Sequential = 0, ThreadX/Y/Z = 1/2/3, BlockX/Y/Z = 4/5/6.
static int64_t gpuProcessorIndex(gpu::Processor p) {
  switch (p) {
  // Block-level processors.
  case gpu::Processor::BlockZ:
    return 6;
  case gpu::Processor::BlockY:
    return 5;
  case gpu::Processor::BlockX:
    return 4;
  // Thread-level processors.
  case gpu::Processor::ThreadZ:
    return 3;
  case gpu::Processor::ThreadY:
    return 2;
  case gpu::Processor::ThreadX:
    return 1;
  // No parallelism.
  case gpu::Processor::Sequential:
    return 0;
  }
  llvm_unreachable("unhandled gpu::Processor");
}
|
|
|
|
/// Inverse of `gpuProcessorIndex`: maps the integers 0..6 to Sequential,
/// ThreadX/Y/Z and BlockX/Y/Z respectively. Any other value maps to
/// `gpu::Processor::Sequential`.
static gpu::Processor indexToGpuProcessor(int64_t idx) {
  static constexpr gpu::Processor kProcessorsByIndex[] = {
      gpu::Processor::Sequential, gpu::Processor::ThreadX,
      gpu::Processor::ThreadY,    gpu::Processor::ThreadZ,
      gpu::Processor::BlockX,     gpu::Processor::BlockY,
      gpu::Processor::BlockZ};
  constexpr int64_t kNumProcessors = 7;
  if (idx < 0 || idx >= kNumProcessors)
    return gpu::Processor::Sequential;
  return kProcessorsByIndex[idx];
}
|
|
|
|
/// Builds a `GPUParallelDimAttr` that wraps `dimInt` as an index-typed
/// integer attribute.
static GPUParallelDimAttr intToParDim(MLIRContext *context, int64_t dimInt) {
  IndexType indexType = IndexType::get(context);
  IntegerAttr dimValue = IntegerAttr::get(indexType, dimInt);
  return GPUParallelDimAttr::get(context, dimValue);
}
|
|
|
|
/// Builds the `GPUParallelDimAttr` for `proc`, storing its
/// `gpuProcessorIndex` as an index-typed integer attribute.
static GPUParallelDimAttr processorParDim(MLIRContext *context,
                                          gpu::Processor proc) {
  const int64_t procIndex = gpuProcessorIndex(proc);
  IntegerAttr dimValue = IntegerAttr::get(IndexType::get(context), procIndex);
  return GPUParallelDimAttr::get(context, dimValue);
}
|
|
|
|
/// Parses a keyword or string naming a `gpu::Processor` and stores the
/// matching parallel dimension in `dim`. Emits an error at the start of the
/// token when the name is not a known processor.
static ParseResult parseProcessorValue(AsmParser &parser,
                                       GPUParallelDimAttr &dim) {
  llvm::SMLoc nameLoc = parser.getCurrentLocation();
  std::string processorName;
  if (failed(parser.parseKeywordOrString(&processorName)))
    return failure();

  if (auto processor = gpu::symbolizeProcessor(processorName)) {
    dim = intToParDim(parser.getContext(), gpuProcessorIndex(*processor));
    return success();
  }
  return parser.emitError(nameLoc)
         << "expected one of ::mlir::gpu::Processor enum names";
}
|
|
|
|
/// Prints the `gpu::Processor` name that corresponds to the integer value
/// stored in `attr`.
static void printProcessorValue(AsmPrinter &printer,
                                const GPUParallelDimAttr &attr) {
  const int64_t storedIndex = attr.getValue().getInt();
  printer << gpu::stringifyProcessor(indexToGpuProcessor(storedIndex));
}
|
|
|
|
} // namespace
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// KernelEnvironmentOp
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
void KernelEnvironmentOp::getSuccessorRegions(
|
|
RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) {
|
|
getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
|
|
regions);
|
|
}
|
|
|
|
/// Returns the values forwarded to `successor`, as computed by the shared
/// single-region helper.
ValueRange KernelEnvironmentOp::getSuccessorInputs(RegionSuccessor successor) {
  Operation *self = getOperation();
  return getSingleRegionSuccessorInputs(self, successor);
}
|
|
|
|
/// Registers the canonicalization patterns of acc.kernel_environment; this is
/// currently only the removal of empty kernel environments.
void KernelEnvironmentOp::getCanonicalizationPatterns(
    RewritePatternSet &results, MLIRContext *context) {
  results.add<RemoveEmptyKernelEnvironment>(context);
}
|
|
|
|
template <typename ComputeConstructT>
|
|
KernelEnvironmentOp
|
|
KernelEnvironmentOp::createAndPopulate(ComputeConstructT computeConstruct,
|
|
OpBuilder &builder) {
|
|
auto kernelEnvironment = KernelEnvironmentOp::create(
|
|
builder, computeConstruct->getLoc(),
|
|
computeConstruct.getDataClauseOperands(),
|
|
computeConstruct.getAsyncOperands(),
|
|
computeConstruct.getAsyncOperandsDeviceTypeAttr(),
|
|
computeConstruct.getAsyncOnlyAttr(), computeConstruct.getWaitOperands(),
|
|
computeConstruct.getWaitOperandsSegmentsAttr(),
|
|
computeConstruct.getWaitOperandsDeviceTypeAttr(),
|
|
computeConstruct.getHasWaitDevnumAttr(),
|
|
computeConstruct.getWaitOnlyAttr());
|
|
Block &block = kernelEnvironment.getRegion().emplaceBlock();
|
|
builder.setInsertionPointToStart(&block);
|
|
return kernelEnvironment;
|
|
}
|
|
|
|
template KernelEnvironmentOp
|
|
KernelEnvironmentOp::createAndPopulate<ParallelOp>(ParallelOp, OpBuilder &);
|
|
template KernelEnvironmentOp
|
|
KernelEnvironmentOp::createAndPopulate<KernelsOp>(KernelsOp, OpBuilder &);
|
|
template KernelEnvironmentOp
|
|
KernelEnvironmentOp::createAndPopulate<SerialOp>(SerialOp, OpBuilder &);
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// FirstprivateMapInitialOp
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Verifies that the op uses the firstprivate data clause, has a `var`
/// operand of mappable or pointer-like type, that `varType` differs from the
/// type of a pointer-like `var`, and that no data clause modifiers are set.
LogicalResult FirstprivateMapInitialOp::verify() {
  if (getDataClause() != acc::DataClause::acc_firstprivate)
    return emitError("data clause associated with firstprivate operation must "
                     "match its intent");

  if (!getVar())
    return emitError("must have var operand");

  Type varOperandType = getVar().getType();
  const bool isPointerLike =
      mlir::isa<mlir::acc::PointerLikeType>(varOperandType);
  if (!isPointerLike && !mlir::isa<mlir::acc::MappableType>(varOperandType))
    return emitError("var must be mappable or pointer-like");

  // For pointer-like variables, `varType` must not simply repeat the pointer
  // type.
  if (isPointerLike && getVarType() == varOperandType)
    return emitError("varType must capture the element type of var");

  if (getModifiers() != acc::DataClauseModifier::none)
    return emitError("no data clause modifiers are allowed");

  return success();
}
|
|
|
|
/// Declares the memory effects of the op: a read of the current device id
/// resource, a read of the `var` operand, and a write to the `accVar` result.
void FirstprivateMapInitialOp::getEffects(
    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
        &effects) {
  auto *deviceIdResource = acc::CurrentDeviceIdResource::get();
  effects.emplace_back(MemoryEffects::Read::get(), deviceIdResource);

  addOperandEffect<MemoryEffects::Read>(effects, getVarMutable());
  addResultEffect<MemoryEffects::Write>(effects, getAccVar());
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// ReductionInitOp
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
void ReductionInitOp::getSuccessorRegions(
|
|
RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) {
|
|
getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
|
|
regions);
|
|
}
|
|
|
|
/// Reports invocation bounds of [1, 1] for the region, independent of
/// `operands`.
void ReductionInitOp::getRegionInvocationBounds(
    ArrayRef<Attribute> operands,
    SmallVectorImpl<InvocationBounds> &invocationBounds) {
  constexpr unsigned kLowerBound = 1;
  constexpr unsigned kUpperBound = 1;
  invocationBounds.emplace_back(kLowerBound, kUpperBound);
}
|
|
|
|
/// Returns the values forwarded to `successor`, as computed by the shared
/// single-region helper.
ValueRange ReductionInitOp::getSuccessorInputs(RegionSuccessor successor) {
  Operation *self = getOperation();
  return getSingleRegionSuccessorInputs(self, successor);
}
|
|
|
|
/// Verifies that, when the region is terminated by acc.yield, the yield has
/// exactly one operand and that operand has the same type as `var`.
LogicalResult ReductionInitOp::verify() {
  Operation *terminator = getRegion().front().getTerminator();
  auto yieldOp = dyn_cast<acc::YieldOp>(terminator);
  // Other terminators are not checked here.
  if (!yieldOp)
    return success();

  if (yieldOp.getNumOperands() != 1)
    return emitOpError(
        "region must yield exactly one value (private storage)");

  Type yieldedType = yieldOp.getOperand(0).getType();
  if (yieldedType != getVar().getType())
    return emitOpError("yielded value type must match var type");

  return success();
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// ReductionCombineRegionOp
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
void ReductionCombineRegionOp::getSuccessorRegions(
|
|
RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) {
|
|
getSingleRegionOpSuccessorRegions(getOperation(), getRegion(), point,
|
|
regions);
|
|
}
|
|
|
|
/// Reports invocation bounds of [1, 1] for the region, independent of
/// `operands`.
void ReductionCombineRegionOp::getRegionInvocationBounds(
    ArrayRef<Attribute> operands,
    SmallVectorImpl<InvocationBounds> &invocationBounds) {
  constexpr unsigned kLowerBound = 1;
  constexpr unsigned kUpperBound = 1;
  invocationBounds.emplace_back(kLowerBound, kUpperBound);
}
|
|
|
|
/// Returns the values forwarded to `successor`, as computed by the shared
/// single-region helper.
ValueRange
ReductionCombineRegionOp::getSuccessorInputs(RegionSuccessor successor) {
  Operation *self = getOperation();
  return getSingleRegionSuccessorInputs(self, successor);
}
|
|
|
|
/// Verifies that, when the region is terminated by acc.yield, the yield
/// carries no operands.
LogicalResult ReductionCombineRegionOp::verify() {
  Operation *terminator = getRegion().front().getTerminator();
  auto yieldOp = dyn_cast<acc::YieldOp>(terminator);
  if (yieldOp && yieldOp.getNumOperands() != 0)
    return emitOpError("region must be terminated by acc.yield with no "
                       "operands");
  return success();
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// ReductionCombineOp
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Declares the memory effects of the op on the default resource: a read of
/// the source memref, then a read and a write of the destination memref.
void ReductionCombineOp::getEffects(
    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
        &effects) {
  auto *resource = SideEffects::DefaultResource::get();
  auto *srcOperand = &getSrcMemrefMutable();
  auto *destOperand = &getDestMemrefMutable();

  effects.emplace_back(MemoryEffects::Read::get(), srcOperand, resource);
  effects.emplace_back(MemoryEffects::Read::get(), destOperand, resource);
  effects.emplace_back(MemoryEffects::Write::get(), destOperand, resource);
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// ComputeRegionOp
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Returns the first acc.par_width operation that defines a launch argument
/// of `op` and whose parallel dimension equals `parDim`, or a null op when
/// there is none. Launch arguments not defined by acc.par_width are skipped.
static ParWidthOp getParWidthOpForLaunchArg(ComputeRegionOp op,
                                            GPUParallelDimAttr parDim) {
  for (Value launchArg : op.getLaunchArgs()) {
    if (auto parWidthOp = launchArg.getDefiningOp<ParWidthOp>();
        parWidthOp &&
        cast<GPUParallelDimAttr>(parWidthOp.getParDim()) == parDim)
      return parWidthOp;
  }
  return nullptr;
}
|
|
|
|
/// Returns the result of the acc.par_width operation that supplies the launch
/// argument for `parDim`, or `std::nullopt` when no launch argument uses that
/// dimension.
std::optional<Value> ComputeRegionOp::getLaunchArg(GPUParallelDimAttr parDim) {
  ParWidthOp parWidthOp = getParWidthOpForLaunchArg(*this, parDim);
  if (!parWidthOp)
    return std::nullopt;
  return parWidthOp.getResult();
}
|
|
|
|
std::optional<Value>
|
|
ComputeRegionOp::getKnownLaunchArg(GPUParallelDimAttr parDim) {
|
|
if (auto parWidthOp = getParWidthOpForLaunchArg(*this, parDim))
|
|
if (parWidthOp.getLaunchArg())
|
|
return parWidthOp.getLaunchArg();
|
|
return {};
|
|
}
|
|
|
|
std::optional<uint64_t>
|
|
ComputeRegionOp::getKnownConstantLaunchArg(GPUParallelDimAttr parDim) {
|
|
auto knownParWidth = getKnownLaunchArg(parDim);
|
|
if (knownParWidth.has_value())
|
|
return getConstantIntValue(knownParWidth.value());
|
|
return {};
|
|
}
|
|
|
|
/// Appends `value` to the input operands and adds a block argument of the
/// same type (at this op's location) to the body. Returns the new block
/// argument.
BlockArgument ComputeRegionOp::appendInputArg(Value value) {
  getInputArgsMutable().append(value);

  Block *body = getBody();
  BlockArgument newArg = body->addArgument(value.getType(), getLoc());
  return newArg;
}
|
|
|
|
bool ComputeRegionOp::isEffectivelySerial() {
|
|
auto *ctx = getContext();
|
|
|
|
if (getLaunchArg(GPUParallelDimAttr::seqDim(ctx)))
|
|
return true;
|
|
|
|
auto checkDim = [&](GPUParallelDimAttr dim) -> bool {
|
|
auto val = getKnownConstantLaunchArg(dim);
|
|
return val && *val == 1;
|
|
};
|
|
|
|
return checkDim(GPUParallelDimAttr::threadXDim(ctx)) &&
|
|
checkDim(GPUParallelDimAttr::threadYDim(ctx)) &&
|
|
checkDim(GPUParallelDimAttr::threadZDim(ctx)) &&
|
|
checkDim(GPUParallelDimAttr::blockXDim(ctx)) &&
|
|
checkDim(GPUParallelDimAttr::blockYDim(ctx)) &&
|
|
checkDim(GPUParallelDimAttr::blockZDim(ctx));
|
|
}
|
|
|
|
/// Returns the entry block argument at the position of the launch argument
/// whose acc.par_width operation has dimension `parDim`. Every launch argument
/// is asserted to be defined by acc.par_width; asking for a dimension that is
/// not among the launch arguments is unreachable.
BlockArgument ComputeRegionOp::parDimToWidth(GPUParallelDimAttr parDim) {
  for (auto [index, launchOperand] : llvm::enumerate(getLaunchArgs())) {
    auto parWidthOp = launchOperand.getDefiningOp<ParWidthOp>();
    assert(parWidthOp);
    if (cast<GPUParallelDimAttr>(parWidthOp.getParDim()) != parDim)
      continue;

    Block &entryBlock = getRegion().front();
    assert(index < entryBlock.getNumArguments() &&
           "launch arg position out of range");
    return entryBlock.getArgument(index);
  }
  llvm_unreachable("attempting to get unspecified parDim");
}
|
|
|
|
/// Returns the parallel dimension of every launch argument, in launch
/// argument order. Each launch argument must be the result of an acc.par_width
/// operation, which is the invariant ComputeRegionOp::verify checks.
SmallVector<GPUParallelDimAttr> ComputeRegionOp::getLaunchParDims() {
  SmallVector<GPUParallelDimAttr> parDims;
  parDims.reserve(getLaunchArgs().size());
  for (Value launchArg : getLaunchArgs()) {
    auto parOp = launchArg.getDefiningOp<ParWidthOp>();
    // Same invariant parDimToWidth asserts; without it a launch argument not
    // defined by acc.par_width would be dereferenced as a null op below.
    assert(parOp && "launch argument must be defined by acc.par_width");
    auto launchArgDim = cast<GPUParallelDimAttr>(parOp.getParDim());
    int64_t dimInt = launchArgDim.getValue().getInt();
    parDims.push_back(intToParDim(getContext(), dimInt));
  }
  return parDims;
}
|
|
|
|
/// Returns the operand bound to `blockArg`. Block arguments are ordered with
/// the launch arguments first, followed by the input arguments.
Value ComputeRegionOp::getOperand(BlockArgument blockArg) {
  const unsigned position = blockArg.getArgNumber();
  const unsigned launchCount = getLaunchArgs().size();
  assert(position < (launchCount + getInputArgs().size()) &&
         "invalid block argument");

  if (position >= launchCount)
    return getInputArgs()[position - launchCount];
  return getLaunchArgs()[position];
}
|
|
|
|
/// Convenience wrapper around `parDimToWidth` that takes a `gpu::Processor`
/// instead of a parallel dimension attribute.
BlockArgument ComputeRegionOp::gpuParWidth(gpu::Processor processor) {
  GPUParallelDimAttr parDim = GPUParallelDimAttr::get(getContext(), processor);
  return parDimToWidth(parDim);
}
|
|
|
|
/// Verifies that every launch argument is the result of an acc.par_width
/// operation and that the entry block has exactly one argument per launch and
/// input operand.
LogicalResult ComputeRegionOp::verify() {
  auto isParWidthResult = [](Value launchArg) {
    return static_cast<bool>(launchArg.getDefiningOp<acc::ParWidthOp>());
  };
  if (!llvm::all_of(getLaunchArgs(), isParWidthResult))
    return emitOpError(
        "launch arguments must be results of acc.par_width operations");

  const unsigned numExpected = getLaunchArgs().size() + getInputArgs().size();
  const unsigned numActual = getRegion().front().getNumArguments();
  if (numActual == numExpected)
    return success();

  return emitOpError("expected ")
         << numExpected << " block arguments (launch + input), got "
         << numActual;
}
|
|
|
|
/// Prints the custom assembly form of the op:
///   [`stream(` value `:` type `)`]
///   [`launch(` arg `=` value {`,` arg `=` value} `)`]
///   [`ins(` arg `=` value {`,` arg `=` value} `) : (` types `)`]
///   [`->` result-types] region [attr-dict]
/// The entry block arguments are printed as part of the `launch`/`ins`
/// bindings instead of with the region, and the operand segment size
/// attribute is elided from the attribute dictionary.
void ComputeRegionOp::print(OpAsmPrinter &p) {
  ValueRange launchOperands = getLaunchArgs();
  ValueRange inputOperands = getInputArgs();
  ValueRange blockArgs = getBody()->getArguments();

  assert(blockArgs.size() == (launchOperands.size() + inputOperands.size()) &&
         "region args mismatch");

  // Emits `blockArg = operand` pairs separated by ", ".
  auto printBindings = [&p](ValueRange args, ValueRange operands) {
    for (size_t idx = 0, count = operands.size(); idx != count; ++idx) {
      if (idx != 0)
        p << ", ";
      p << args[idx] << " = " << operands[idx];
    }
  };

  if (getStream())
    p << " stream(" << getStream() << " : " << getStream().getType() << ")";

  // Launch arguments own the leading block arguments.
  const size_t numLaunch = launchOperands.size();
  if (numLaunch != 0) {
    p << " launch(";
    printBindings(blockArgs.take_front(numLaunch), launchOperands);
    p << ")";
  }

  // Input arguments own the remaining block arguments; only their types are
  // spelled out.
  if (!inputOperands.empty()) {
    p << " ins(";
    printBindings(blockArgs.drop_front(numLaunch), inputOperands);
    p << ") : (";
    llvm::interleaveComma(inputOperands.getTypes(), p);
    p << ")";
  }

  p.printOptionalArrowTypeList(getResultTypes());
  p << " ";
  p.printRegion(getRegion(), /*printEntryBlockArgs=*/false);
  p.printOptionalAttrDict((*this)->getAttrs(),
                          /*elidedAttrs=*/getOperandSegmentSizeAttr());
}
|
|
|
|
/// Parses the custom assembly form produced by ComputeRegionOp::print:
///   [`stream(` value `:` type `)`]
///   [`launch(` arg `=` value {`,` arg `=` value} `)`]
///   [`ins(` arg `=` value {`,` arg `=` value} `) : (` types `)`]
///   [`->` result-types] region [attr-dict]
/// Launch arguments are always given index type; input argument types come
/// from the explicit type list, which must have one entry per input. Operands
/// are added to `result` in the order launch, input, stream, matching the
/// operand segment sizes attribute recorded here.
ParseResult ComputeRegionOp::parse(OpAsmParser &parser,
                                   OperationState &result) {
  auto &builder = parser.getBuilder();

  SmallVector<OpAsmParser::Argument> regionArgs;
  OpAsmParser::UnresolvedOperand streamOperand;
  Type streamType;
  SmallVector<OpAsmParser::UnresolvedOperand> launchOperands;
  SmallVector<OpAsmParser::UnresolvedOperand> inputOperands;
  SmallVector<Type> types;

  bool hasStream = false;
  if (succeeded(parser.parseOptionalKeyword("stream"))) {
    hasStream = true;
    if (parser.parseLParen() || parser.parseOperand(streamOperand) ||
        parser.parseColon() || parser.parseType(streamType) ||
        parser.parseRParen())
      return failure();
  }

  if (succeeded(parser.parseOptionalKeyword("launch"))) {
    if (parser.parseAssignmentList(regionArgs, launchOperands))
      return failure();
    // Launch arguments carry no explicit type in the syntax; they are index.
    Type indexType = builder.getIndexType();
    for (size_t i = 0; i < regionArgs.size(); ++i)
      types.push_back(indexType);
  }

  if (succeeded(parser.parseOptionalKeyword("ins"))) {
    if (parser.parseAssignmentList(regionArgs, inputOperands) ||
        parser.parseColon() || parser.parseLParen())
      return failure();
    llvm::SMLoc typeListLoc = parser.getCurrentLocation();
    if (parser.parseTypeList(types) || parser.parseRParen())
      return failure();
    // Report a type-count mismatch as a diagnostic. Without this check the
    // mismatch would reach zip_equal and the `types[i]` accesses below.
    if (types.size() != regionArgs.size())
      return parser.emitError(typeListLoc)
             << "expected " << inputOperands.size()
             << " type(s) for the `ins` arguments, got "
             << (types.size() - launchOperands.size());
  }

  if (parser.parseOptionalArrowTypeList(result.types))
    return failure();

  for (auto [iterArg, type] : llvm::zip_equal(regionArgs, types))
    iterArg.type = type;

  Region *body = result.addRegion();
  if (parser.parseRegion(*body, regionArgs))
    return failure();

  const size_t numLaunchOperands = launchOperands.size();
  const size_t numInputOperands = inputOperands.size();
  assert(numLaunchOperands + numInputOperands == regionArgs.size() &&
         "compute region args mismatch");

  result.addAttribute(
      ComputeRegionOp::getOperandSegmentSizeAttr(),
      builder.getDenseI32ArrayAttr({static_cast<int32_t>(numLaunchOperands),
                                    static_cast<int32_t>(numInputOperands),
                                    hasStream ? 1 : 0}));

  for (size_t i = 0; i < numLaunchOperands; ++i) {
    if (parser.resolveOperand(launchOperands[i], types[i], result.operands))
      return failure();
  }

  for (size_t i = numLaunchOperands; i < regionArgs.size(); ++i) {
    if (parser.resolveOperand(inputOperands[i - numLaunchOperands], types[i],
                              result.operands))
      return failure();
  }

  if (hasStream) {
    if (parser.resolveOperand(streamOperand, streamType, result.operands))
      return failure();
  }

  if (parser.parseOptionalAttrDict(result.attributes))
    return failure();

  return success();
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// GPUParallelDimAttr
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Builds the parallel dimension attribute that corresponds to `proc`.
GPUParallelDimAttr GPUParallelDimAttr::get(MLIRContext *context,
                                           gpu::Processor proc) {
  GPUParallelDimAttr parDim = processorParDim(context, proc);
  return parDim;
}
|
|
|
|
/// Returns the parallel dimension for `gpu::Processor::Sequential`.
GPUParallelDimAttr GPUParallelDimAttr::seqDim(MLIRContext *context) {
  constexpr gpu::Processor kProcessor = gpu::Processor::Sequential;
  return processorParDim(context, kProcessor);
}
|
|
|
|
/// Returns the parallel dimension for `gpu::Processor::ThreadX`.
GPUParallelDimAttr GPUParallelDimAttr::threadXDim(MLIRContext *context) {
  constexpr gpu::Processor kProcessor = gpu::Processor::ThreadX;
  return processorParDim(context, kProcessor);
}
|
|
|
|
/// Returns the parallel dimension for `gpu::Processor::ThreadY`.
GPUParallelDimAttr GPUParallelDimAttr::threadYDim(MLIRContext *context) {
  constexpr gpu::Processor kProcessor = gpu::Processor::ThreadY;
  return processorParDim(context, kProcessor);
}
|
|
|
|
/// Returns the parallel dimension for `gpu::Processor::ThreadZ`.
GPUParallelDimAttr GPUParallelDimAttr::threadZDim(MLIRContext *context) {
  constexpr gpu::Processor kProcessor = gpu::Processor::ThreadZ;
  return processorParDim(context, kProcessor);
}
|
|
|
|
/// Returns the parallel dimension for `gpu::Processor::BlockX`.
GPUParallelDimAttr GPUParallelDimAttr::blockXDim(MLIRContext *context) {
  constexpr gpu::Processor kProcessor = gpu::Processor::BlockX;
  return processorParDim(context, kProcessor);
}
|
|
|
|
/// Returns the parallel dimension for `gpu::Processor::BlockY`.
GPUParallelDimAttr GPUParallelDimAttr::blockYDim(MLIRContext *context) {
  constexpr gpu::Processor kProcessor = gpu::Processor::BlockY;
  return processorParDim(context, kProcessor);
}
|
|
|
|
/// Returns the parallel dimension for `gpu::Processor::BlockZ`.
GPUParallelDimAttr GPUParallelDimAttr::blockZDim(MLIRContext *context) {
  constexpr gpu::Processor kProcessor = gpu::Processor::BlockZ;
  return processorParDim(context, kProcessor);
}
|
|
|
|
/// Parses `<` processor_name `>` into a parallel dimension attribute. On any
/// failure an error describing the expected format is emitted at the current
/// location and a null attribute is returned. `type` is not used.
Attribute GPUParallelDimAttr::parse(AsmParser &parser, Type type) {
  GPUParallelDimAttr dim;
  const bool parseFailed = parser.parseLess() ||
                           parseProcessorValue(parser, dim) ||
                           parser.parseGreater();
  if (!parseFailed)
    return dim;

  parser.emitError(parser.getCurrentLocation(),
                   "expected format `<` processor_name `>`");
  return {};
}
|
|
|
|
/// Prints the attribute as `<` processor_name `>`.
void GPUParallelDimAttr::print(AsmPrinter &printer) const {
  const gpu::Processor processor = indexToGpuProcessor(getValue().getInt());
  printer << "<";
  printer << gpu::stringifyProcessor(processor);
  printer << ">";
}
|
|
|
|
/// Returns the thread-level parallel dimension for `index`: 0, 1 and 2 select
/// ThreadX, ThreadY and ThreadZ. Any other index is a programming error.
GPUParallelDimAttr GPUParallelDimAttr::threadDim(MLIRContext *context,
                                                 unsigned index) {
  assert(index <= 2 && "thread dimension index must be 0, 1, or 2");
  if (index == 0)
    return threadXDim(context);
  if (index == 1)
    return threadYDim(context);
  if (index == 2)
    return threadZDim(context);
  llvm_unreachable("validated thread dimension index");
}
|
|
|
|
/// Returns the block-level parallel dimension for `index`: 0, 1 and 2 select
/// BlockX, BlockY and BlockZ. Any other index is a programming error.
GPUParallelDimAttr GPUParallelDimAttr::blockDim(MLIRContext *context,
                                                unsigned index) {
  assert(index <= 2 && "block dimension index must be 0, 1, or 2");
  if (index == 0)
    return blockXDim(context);
  if (index == 1)
    return blockYDim(context);
  if (index == 2)
    return blockZDim(context);
  llvm_unreachable("validated block dimension index");
}
|
|
|
|
/// Returns the `gpu::Processor` encoded by the integer value of this
/// attribute.
gpu::Processor GPUParallelDimAttr::getProcessor() const {
  const int64_t storedIndex = getValue().getInt();
  return indexToGpuProcessor(storedIndex);
}
|
|
|
|
int GPUParallelDimAttr::getOrder() const {
|
|
return gpuProcessorIndex(getProcessor());
|
|
}
|
|
|
|
/// Returns the dimension whose order is one above this one. BlockZ (order 6)
/// is the highest dimension and is returned unchanged.
GPUParallelDimAttr GPUParallelDimAttr::getOneHigher() const {
  const int currentOrder = getOrder();
  if (currentOrder < 6)
    return get(getContext(), indexToGpuProcessor(currentOrder + 1));
  return *this;
}
|
|
|
|
/// Returns the dimension whose order is one below this one. Sequential
/// (order 0) is the lowest dimension and is returned unchanged.
GPUParallelDimAttr GPUParallelDimAttr::getOneLower() const {
  const int currentOrder = getOrder();
  if (currentOrder > 0)
    return get(getContext(), indexToGpuProcessor(currentOrder - 1));
  return *this;
}
|
|
|
|
bool GPUParallelDimAttr::isSeq() const {
|
|
return getProcessor() == gpu::Processor::Sequential;
|
|
}
|
|
bool GPUParallelDimAttr::isThreadX() const {
|
|
return getProcessor() == gpu::Processor::ThreadX;
|
|
}
|
|
bool GPUParallelDimAttr::isThreadY() const {
|
|
return getProcessor() == gpu::Processor::ThreadY;
|
|
}
|
|
bool GPUParallelDimAttr::isThreadZ() const {
|
|
return getProcessor() == gpu::Processor::ThreadZ;
|
|
}
|
|
bool GPUParallelDimAttr::isBlockX() const {
|
|
return getProcessor() == gpu::Processor::BlockX;
|
|
}
|
|
bool GPUParallelDimAttr::isBlockY() const {
|
|
return getProcessor() == gpu::Processor::BlockY;
|
|
}
|
|
bool GPUParallelDimAttr::isBlockZ() const {
|
|
return getProcessor() == gpu::Processor::BlockZ;
|
|
}
|
|
bool GPUParallelDimAttr::isAnyThread() const {
|
|
return isThreadX() || isThreadY() || isThreadZ();
|
|
}
|
|
bool GPUParallelDimAttr::isAnyBlock() const {
|
|
return isBlockX() || isBlockY() || isBlockZ();
|
|
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// GPUParallelDimsAttr
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Returns a dimension list that contains only the sequential dimension.
GPUParallelDimsAttr GPUParallelDimsAttr::seq(MLIRContext *ctx) {
  GPUParallelDimAttr sequential = GPUParallelDimAttr::seqDim(ctx);
  return GPUParallelDimsAttr::get(ctx, {sequential});
}
|
|
|
|
bool GPUParallelDimsAttr::isSeq() const {
|
|
assert(!getArray().empty() && "no par_dims found");
|
|
if (getArray().size() == 1) {
|
|
auto parDim = dyn_cast<GPUParallelDimAttr>(getArray()[0]);
|
|
assert(parDim && "expected GPUParallelDimAttr");
|
|
return parDim.isSeq();
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/// Returns true when the dimension list is not sequential (see `isSeq`).
bool GPUParallelDimsAttr::isParallel() const {
  const bool sequential = isSeq();
  return !sequential;
}
|
|
|
|
/// Returns true when the list holds more than one dimension.
bool GPUParallelDimsAttr::isMultiDim() const {
  const auto numDims = getArray().size();
  return numDims > 1;
}
|
|
|
|
bool GPUParallelDimsAttr::hasAnyBlockLevel() const {
|
|
return llvm::any_of(
|
|
getArray(), [](const GPUParallelDimAttr &p) { return p.isAnyBlock(); });
|
|
}
|
|
|
|
bool GPUParallelDimsAttr::hasOnlyBlockLevel() const {
|
|
return !getArray().empty() &&
|
|
llvm::all_of(getArray(), [](const GPUParallelDimAttr &p) {
|
|
return p.isAnyBlock();
|
|
});
|
|
}
|
|
|
|
bool GPUParallelDimsAttr::hasOnlyThreadYLevel() const {
|
|
return !getArray().empty() &&
|
|
llvm::all_of(getArray(), [](const GPUParallelDimAttr &p) {
|
|
return p.isThreadY();
|
|
});
|
|
}
|
|
|
|
bool GPUParallelDimsAttr::hasOnlyThreadXLevel() const {
|
|
return !getArray().empty() &&
|
|
llvm::all_of(getArray(), [](const GPUParallelDimAttr &p) {
|
|
return p.isThreadX();
|
|
});
|
|
}
|
|
|
|
/// Parses a square-bracketed, comma-separated list of processor names into a
/// dimension list attribute. Returns a null attribute when parsing fails.
/// `type` is not used.
Attribute GPUParallelDimsAttr::parse(AsmParser &parser, Type type) {
  SmallVector<GPUParallelDimAttr> dims;

  // Parses one processor name and appends its dimension to `dims`.
  auto parseOneDim = [&]() -> ParseResult {
    GPUParallelDimAttr parsedDim;
    if (parseProcessorValue(parser, parsedDim))
      return failure();
    dims.push_back(parsedDim);
    return success();
  };

  if (parser.parseCommaSeparatedList(AsmParser::Delimiter::Square, parseOneDim,
                                     "list of OpenACC GPU parallel dimensions"))
    return {};
  return GPUParallelDimsAttr::get(parser.getContext(), dims);
}
|
|
|
|
/// Prints the dimension list as `[` name {`, ` name} `]`, where each name is
/// the processor name of the corresponding dimension.
void GPUParallelDimsAttr::print(AsmPrinter &printer) const {
  printer << "[";
  bool isFirst = true;
  for (const GPUParallelDimAttr &dim : getArray()) {
    if (!isFirst)
      printer << ", ";
    isFirst = false;
    printProcessorValue(printer, dim);
  }
  printer << "]";
}
|