//===- SparseGPUCodegen.cpp - Generates GPU code (using CUDA) -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This is a prototype GPU codegenerator for the sparse compiler. // The objective is to eventually use the right combination of // direct code generation and libary calls into vendor-specific // highly optimized sparse libraries (e.g. cuSparse for CUDA). // //===----------------------------------------------------------------------===// #include "CodegenUtils.h" #include "LoopEmitter.h" #include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" #include "mlir/Dialect/SparseTensor/Transforms/Passes.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Matchers.h" using namespace mlir; using namespace mlir::sparse_tensor; namespace { //===----------------------------------------------------------------------===// // Helper methods. //===----------------------------------------------------------------------===// /// Marks the given top module as a GPU container module. static void markAsGPUContainer(ModuleOp topModule) { topModule->setAttr(gpu::GPUDialect::getContainerModuleAttrName(), UnitAttr::get(topModule->getContext())); } /// Constructs a new GPU module (for GPU kernels) inside the given top module. static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule, StringRef name) { markAsGPUContainer(topModule); builder.setInsertionPointToStart(&topModule.getBodyRegion().front()); return builder.create(topModule->getLoc(), name); } /// Constructs a new GPU kernel in the given GPU module. static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule, StringRef name, SmallVectorImpl &args) { builder.setInsertionPointToStart(&gpuModule.getBodyRegion().front()); SmallVector argsTp; for (unsigned i = 0, e = args.size(); i < e; i++) argsTp.push_back(args[i].getType()); FunctionType type = FunctionType::get(gpuModule->getContext(), argsTp, {}); auto gpuFunc = builder.create(gpuModule->getLoc(), name, type); gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(), builder.getUnitAttr()); return gpuFunc; } /// Constructs code to launch GPU kernel. static void genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc, SmallVectorImpl &args, unsigned numThreads) { Location loc = gpuFunc->getLoc(); Value none = TypedValue<::mlir::IntegerType>{}; Value one = constantIndex(builder, loc, 1); Value numT = constantIndex(builder, loc, numThreads); gpu::KernelDim3 gridSize = {one, one, one}; gpu::KernelDim3 blckSize = {numT, one, one}; builder.create(loc, gpuFunc, gridSize, blckSize, /*dynSharedMemSz*/ none, args); } /// Maps the provided ranked host buffer into the device address space. /// Writes from the host are guaranteed to be visible to device kernels /// that are launched afterwards. Writes from the device are guaranteed /// to be visible on the host after synchronizing with the device kernel /// completion. static Value genHostRegisterMemref(OpBuilder &builder, Location loc, Value mem) { MemRefType memTp = mem.getType().cast(); UnrankedMemRefType resTp = UnrankedMemRefType::get(memTp.getElementType(), /*memorySpace=*/0); Value cast = builder.create(loc, resTp, mem); builder.create(loc, cast); return mem; // convenience pass-through } /// Constructs code for new GPU kernel. static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc, scf::ParallelOp forallOp, SmallVectorImpl &constants, SmallVectorImpl &scalars, SmallVectorImpl &buffers) { Location loc = gpuFunc->getLoc(); Block &block = gpuFunc.getBody().front(); rewriter.setInsertionPointToStart(&block); // Re-generate the constants, recapture all arguments. unsigned arg = 0; IRMapping irMap; for (Value c : constants) irMap.map(c, rewriter.clone(*c.getDefiningOp())->getResult(0)); for (Value s : scalars) irMap.map(s, block.getArgument(arg++)); for (Value b : buffers) irMap.map(b, block.getArgument(arg++)); // Assume 1-dimensional grid/block configuration (only x dimension), // so that: // row = blockIdx.x * blockDim.x + threadIdx.x // inc = blockDim.x * gridDim.x Value bid = rewriter.create(loc, gpu::Dimension::x); Value bsz = rewriter.create(loc, gpu::Dimension::x); Value tid = rewriter.create(loc, gpu::Dimension::x); Value gsz = rewriter.create(loc, gpu::Dimension::x); Value mul = rewriter.create(loc, bid, bsz); Value row = rewriter.create(loc, mul, tid); Value inc = rewriter.create(loc, bsz, gsz); // Construct the iteration over the computational space that // accounts for the fact that the total number of threads and // the amount of work to be done usually do not match precisely. // for (r = row; r < N; r += inc) { // // } Value upper = irMap.lookup(forallOp.getUpperBound()[0]); scf::ForOp forOp = rewriter.create(loc, row, upper, inc); rewriter.cloneRegionBefore(forallOp.getLoopBody(), forOp.getLoopBody(), forOp.getLoopBody().begin(), irMap); // Done. rewriter.setInsertionPointAfter(forOp); rewriter.create(gpuFunc->getLoc()); } //===----------------------------------------------------------------------===// // Rewriting rules. //===----------------------------------------------------------------------===// /// Proof-of-concept rewriter. This rule generates a CUDA implementation /// for each outermost forall loop generated by the sparse compiler. // // TODO: right works with parallelization-strategy=dense-outer-loop // but give this its own flags in the future // struct ForallRewriter : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; ForallRewriter(MLIRContext *context, unsigned nT) : OpRewritePattern(context), numThreads(nT){}; LogicalResult matchAndRewrite(scf::ParallelOp forallOp, PatternRewriter &rewriter) const override { // Reject inadmissible loop form. // Essentially only accept a loop, generated by the sparse compiler, // of the form // forall (i = 0; i < N; i++) // so that cyclic scheduling over the threads is easy. if (!forallOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName()) || forallOp.getNumReductions() != 0 || forallOp.getNumLoops() != 1 || !matchPattern(forallOp.getLowerBound()[0], m_Zero()) || !matchPattern(forallOp.getStep()[0], m_One())) return failure(); // Collect every value that is computed outside the parallel loop. SetVector invariants; // stable iteration! forallOp->walk([&](Operation *op) { // Collect all values of admissible ops. for (OpOperand &o : op->getOpOperands()) { Value val = o.get(); Block *block; if (auto arg = val.dyn_cast()) block = arg.getOwner(); else block = val.getDefiningOp()->getBlock(); if (!isNestedIn(block, forallOp)) invariants.insert(val); } }); // Outline the outside values as proper parameters. Fail when sharing // value between host and device is not straightforward. SmallVector constants; SmallVector scalars; SmallVector buffers; for (Value val : invariants) { Type tp = val.getType(); if (val.getDefiningOp()) constants.push_back(val); else if (tp.isa() || tp.isIntOrIndex()) scalars.push_back(val); else if (isa(tp)) buffers.push_back(val); else return failure(); // don't know how to share } // Prepare the outlined arguments, register buffers. Location loc = forallOp->getLoc(); SmallVector args; for (Value s : scalars) args.push_back(s); for (Value b : buffers) args.push_back(genHostRegisterMemref(rewriter, loc, b)); auto saveIp = rewriter.saveInsertionPoint(); // Set up GPU module and construct GPU function. // // TODO: only generate once, avoid name conflict // ModuleOp topModule = forallOp->getParentOfType(); auto gpuModule = genGPUModule(rewriter, topModule, "sparsekernels"); auto gpuFunc = genGPUFunc(rewriter, gpuModule, "kernel", args); genGPUCode(rewriter, gpuFunc, forallOp, constants, scalars, buffers); // Generate code that launches the kernel. rewriter.restoreInsertionPoint(saveIp); genLaunchGPUFunc(rewriter, gpuFunc, args, numThreads); rewriter.eraseOp(forallOp); return success(); } private: // Helper method to see if block appears in given loop. static bool isNestedIn(Block *block, scf::ParallelOp forallOp) { for (Operation *o = block->getParentOp(); o; o = o->getParentOp()) { if (o == forallOp) return true; } return false; } unsigned numThreads; }; } // namespace //===----------------------------------------------------------------------===// // Public method for populating GPU rewriting rules. //===----------------------------------------------------------------------===// void mlir::populateSparseGPUCodegenPatterns(RewritePatternSet &patterns, unsigned numThreads) { patterns.add(patterns.getContext(), numThreads); }