//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps ------------------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utility methods for working with the XeGPU dialect.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/ValueRange.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/FormatVariadic.h"
// NOTE(review): the two directives below have no header name in this copy of
// the file, and angle-bracketed template arguments appear to be missing
// throughout (e.g. `SmallVector result;`). Presumably lost in extraction --
// confirm against the canonical file before building.
#include
#include

using namespace mlir;

/// convert ArrayRef into SmallVector
/// Appends every element of every entry of `values`, in order, to one result
/// vector.
SmallVector xegpu::flattenValues(ArrayRef values) {
  SmallVector result;
  for (const auto &vals : values)
    llvm::append_range(result, vals);
  return result;
}

/// Computes the 1-D vector type obtained by distributing the tensor
/// descriptor `tdescTy`. Returns failure() when the descriptor's layout is
/// absent or `isForSubgroup()` is false.
FailureOr
mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
  auto layout = llvm::dyn_cast_if_present(tdescTy.getLayout());
  // It only works for subgroup level layout, which only has lane_layout
  // and lane_data, and is to distribute a SIMD code into SIMT code.
  if (!layout || !layout.isForSubgroup())
    return failure();

  SmallVector laneData(layout.getLaneData().asArrayRef());
  SmallVector laneLayout(layout.getLaneLayout().asArrayRef());
  auto tdescShape = tdescTy.getShape();
  auto elementType = tdescTy.getElementType();

  // compute sgSize by multiply elements of laneLayout
  // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
  // e.g. for 1D layout, sgSize = laneLayout[0]
  int64_t sgSize = llvm::product_of(laneLayout);

  // Case 1: regular loads/stores
  auto scatterAttr = tdescTy.getEncodingOfType();
  if (scatterAttr) {
    auto chunkSize = scatterAttr.getChunkSize().getInt();
    // Verify if the first dimension of the tensor descriptor shape is
    // distributable.
    // (Checked with assert only, so it is not enforced when asserts are
    // compiled out.)
    assert(tdescShape[0] == laneLayout[0] &&
           "tensor descriptor shape is not distributable");
    // The result has a single dimension of chunkSize elements.
    return VectorType::get({chunkSize}, elementType);
  }

  // Case 2: block loads/stores
  // Check if the tensor descriptor shape is distributable.
  int64_t tensorSize = 1;
  for (auto [tdescDim, laneDim, laneDataDim] :
       llvm::zip_equal(tdescShape, laneLayout, laneData)) {
    assert((tdescDim % (laneDim * laneDataDim) == 0) &&
           "tensor descriptor shape is not distributable");
    tensorSize *= tdescDim;
  }
  // tensorSize must be adjusted for array_length.
  tensorSize *= tdescTy.getArrayLength();

  // Total element count divided by the product of the lane layout.
  return VectorType::get({tensorSize / sgSize}, elementType);
}

/// Overload for a vector type plus an explicit layout. Builds a helper
/// TensorDescType from the vector shape and delegates to the TensorDescType
/// overload. Returns failure() for ranks outside [1, 3].
FailureOr
mlir::xegpu::getDistributedVectorType(VectorType originalType,
                                      xegpu::LayoutAttr layout) {
  int64_t rank = originalType.getRank();
  // Distributed vector type is only supported for 1D, 2D and 3D vectors.
  if (rank < 1 || rank > 3)
    return failure();
  ArrayRef shape = originalType.getShape();
  // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension
  // of the 3D vector.
  int arrayLength = 1;
  if (rank == 3) {
    arrayLength = shape[0];
    shape = shape.drop_front();
  }
  auto helperTdescTy = xegpu::TensorDescType::get(
      shape, originalType.getElementType(), arrayLength,
      /*boundary_check=*/true,
      /*memory_space=*/xegpu::MemorySpace::Global, layout);
  return xegpu::getDistributedVectorType(helperTdescTy);
}

/// Divides the trailing dimensions of `originalType` by the effective lane
/// layout of `layout`; leading dimensions are copied unchanged. Returns
/// failure() when `layout` is null or a distributed dimension is not evenly
/// divisible by its lane-layout entry.
FailureOr
xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
                                       VectorType originalType) {
  if (!layout)
    return failure();
  assert((isa(layout) || isa(layout)) && "Expecting a valid layout.");
  SmallVector effectiveLaneLayout = layout.getEffectiveLaneLayoutAsInt();
  assert(static_cast(originalType.getRank()) >= effectiveLaneLayout.size() &&
         "Rank of the original vector type should be greater or equal to the "
         "size of the lane layout to distribute the vector type.");
  SmallVector distributedShape(originalType.getShape());
  // Only distribute the last `laneLayout.size()` dimensions. The remaining
  // dimensions are not distributed.
  unsigned distributionStart =
      originalType.getRank() - effectiveLaneLayout.size();
  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
    if (i < distributionStart)
      continue;
    // Check if the dimension can be distributed evenly.
    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
      return failure();
    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
  }
  return VectorType::get(distributedShape, originalType.getElementType());
}

/// Returns "layout_operand_" followed by the operand's number.
std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
  const StringRef prefix("layout_operand_");
  unsigned idx = const_cast(operand).getOperandNumber();
  return llvm::formatv("{0}{1}", prefix, idx).str();
}

/// Returns "layout_result_" followed by the result's number.
std::string xegpu::getTemporaryLayoutName(const OpResult result) {
  const StringRef prefix = "layout_result_";
  return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
}

/// Looks up the layout associated with `value`. The sources are tried in the
/// order written below; returns nullptr when `value` is null or none of them
/// yields a layout.
xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
  if (!value)
    return nullptr;

  // Layout carried by the value's type.
  if (auto tdescTy = dyn_cast_if_present(value.getType()))
    return tdescTy.getLayoutAttr();

  if (auto result = dyn_cast(value)) {
    Operation *defOp = result.getDefiningOp();
    assert(defOp && "result must have a defining op");

    // The anchor layout of the defining op takes precedence over the
    // temporary attribute looked up below.
    if (auto anchorOp = dyn_cast(defOp)) {
      auto layout = anchorOp.getAnchorLayout();
      return layout;
    }

    std::string layoutName = getTemporaryLayoutName(result);
    if (defOp->hasAttr(layoutName)) {
      auto layout = defOp->getAttrOfType(layoutName);
      return layout;
    }
  }

  if (auto arg = dyn_cast(value)) {
    auto *parentOp = arg.getOwner()->getParentOp();
    if (auto loop = dyn_cast_if_present(parentOp)) {
      // Recurse on the init value tied to this argument, if there is one.
      OpOperand *tiedInit = loop.getTiedLoopInit(arg);
      if (tiedInit)
        return getDistributeLayoutAttr(tiedInit->get());
    }
  }

  return nullptr;
}

/// Looks up the layout associated with operand `opr` of its owner op: first
/// through the op-specific accessors below, then through the temporary
/// attribute named by getTemporaryLayoutName(opr). Returns nullptr when
/// neither yields a layout.
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
  Operation *op = opr.getOwner();
  unsigned idx = const_cast(opr).getOperandNumber();

  if (auto anchorOp = dyn_cast(op)) {
    if (auto dpasOp = dyn_cast(op)) {
      // Operands 0, 1 and 2 map to the A, B and Cd layout attributes; any
      // other index falls through to the generic handling below.
      if (idx == 0) {
        return dpasOp.getLayoutAAttr();
      } else if (idx == 1) {
        return dpasOp.getLayoutBAttr();
      } else if (idx == 2) {
        return dpasOp.getLayoutCdAttr();
      }
    }
    if (auto convertOp = dyn_cast(op)) {
      return convertOp.getInputLayoutAttr();
    }
    auto layout = anchorOp.getAnchorLayout();

    if (idx == 0)
      return layout;

    // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
    // the layout is valid for the first two operands: value and memref/tdesc.
    // For other operations, the layout applies to the first operand only.
    if (isa( op) && (idx < 2))
      return layout;
  }

  std::string layoutName = xegpu::getTemporaryLayoutName(opr);
  if (op->hasAttr(layoutName)) {
    auto layout = op->getAttrOfType(layoutName);
    return layout;
  }

  return nullptr;
}

// Returns the permanent layout attribute for the given result if it's
// available on the defining op. Otherwise returns the provided layout.
// `result` and `name` are not read by the body.
xegpu::DistributeLayoutAttr
maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                         const OpResult &result, mlir::Operation *owner,
                         const std::string &name) {
  xegpu::DistributeLayoutAttr candidate = layout;

  if (auto loadOp = dyn_cast(owner)) {
    if (auto perm = loadOp.getLayoutAttr())
      candidate = perm;
  }

  return candidate;
}

// Returns the permanent layout attribute for the given operand if it's
// available on the defining op. Otherwise returns the provided layout.
xegpu::DistributeLayoutAttr
maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                         const OpOperand &operand, mlir::Operation *owner,
                         const std::string &name) {
  // `name` is not read by the body.
  xegpu::DistributeLayoutAttr candidate = layout;
  unsigned idx = const_cast(operand).getOperandNumber();

  if (auto storeOp = dyn_cast(owner)) {
    // Only operand 0 can pick up the op's own layout attribute.
    if (idx == 0) {
      if (auto perm = storeOp.getLayoutAttr())
        candidate = perm;
    }
  }

  return candidate;
}

// TODO-LayoutRefactor: Remove this function after replacing use
// with setTemporaryLayout or setAnchorLayout
//
// Attaches `layout` to the op owning `result`: through setAnchorLayout when
// the first cast below succeeds, otherwise as the attribute named by
// getTemporaryLayoutName(result).
void xegpu::setDistributeLayoutAttr(
    const mlir::OpResult &result,
    const mlir::xegpu::DistributeLayoutAttr layout) {
  Operation *owner = result.getOwner();

  if (auto anchorOp = dyn_cast(owner)) {
    // Nothing to do when the anchor layout already equals `layout`.
    if (anchorOp.getAnchorLayout() == layout)
      return;
    anchorOp.setAnchorLayout(layout);
    return;
  }

  std::string name = xegpu::getTemporaryLayoutName(result);
  // An attribute already present under that name is left as is.
  if (owner->hasAttrOfType(name)) {
    return;
  }
  // A null layout is never written.
  if (layout) {
    owner->setAttr(name, layout);
  }
}

// TODO-LayoutRefactor: Remove this function after replacing use
// with setTemporaryLayout or setAnchorLayout
//
// Attaches `layout` to the op owning `operand`. A null `layout` is a no-op.
void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
                                    const DistributeLayoutAttr layout) {
  Operation *owner = operand.getOwner();
  unsigned idx = const_cast(operand).getOperandNumber();

  if (!layout) {
    return;
  }
  if (auto anchorOp = dyn_cast(owner)) {
    if (auto dpasOp = dyn_cast(owner)) {
      // Operands 0, 1 and 2 map to the A, B and Cd layout attributes; any
      // other index falls through to the generic handling below.
      if (idx == 0) {
        return dpasOp.setLayoutAAttr(layout);
      } else if (idx == 1) {
        return dpasOp.setLayoutBAttr(layout);
      } else if (idx == 2) {
        return dpasOp.setLayoutCdAttr(layout);
      }
    }
    if (auto convertOp = dyn_cast(owner)) {
      return convertOp.setInputLayoutAttr(layout);
    }

    // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
    // the layout is valid for the first two operands: value and memref/tdesc.
    // For other operations, the layout applies to the first operand only.
    if (isa( owner)) {
      if (idx < 2) {
        anchorOp.setAnchorLayout(layout);
      }
    } else {
      if (idx == 0) {
        anchorOp.setAnchorLayout(layout);
      }
    }
    // No return here: control continues to the temporary-attribute handling
    // below even after the anchor layout has been set.
  }

  std::string name = xegpu::getTemporaryLayoutName(operand);
  // An attribute already present under that name is left as is.
  if (owner->hasAttrOfType(name)) {
    return;
  }
  if (layout) {
    owner->setAttr(name, layout);
  }
}

// Returns the attribute stored on the owner op under
// getTemporaryLayoutName(operandOrResult), or nullptr when the op has no
// attribute with that name.
template
xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout(const T &operandOrResult) {
  Operation *op = operandOrResult.getOwner();

  std::string layoutName = xegpu::getTemporaryLayoutName(operandOrResult);
  if (op->hasAttr(layoutName)) {
    auto layout = op->getAttrOfType(layoutName);
    return layout;
  }

  return nullptr;
}

// Explicit instantiations for OpResult and OpOperand.
template xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout(const OpResult &result);
template xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout(const OpOperand &operand);

// Stores `layout` on the owner op under
// getTemporaryLayoutName(operandOrResult). Does nothing when an attribute is
// already present under that name or when `layout` is null.
template
void xegpu::setTemporaryLayout(const T &operandOrResult,
                               const xegpu::DistributeLayoutAttr layout) {
  Operation *owner = operandOrResult.getOwner();
  std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
  if (owner->hasAttrOfType(name)) {
    return;
  }
  if (layout) {
    owner->setAttr(name, layout);
  }
}

// Explicit instantiations for OpResult and OpOperand.
template void xegpu::setTemporaryLayout(
    const mlir::OpResult &result,
    const mlir::xegpu::DistributeLayoutAttr layout);

template void xegpu::setTemporaryLayout(
    const mlir::OpOperand &operand,
    const mlir::xegpu::DistributeLayoutAttr layout);

// Splits `value` into slices of `shape` using ExtractStridedSliceOp, one per
// offset produced by StaticTileOffsetRange. Returns {value} unchanged when
// the cast of its type fails or computeShapeRatio(srcShape, shape) yields no
// result.
SmallVector
xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
                                        Value value, ArrayRef shape) {
  auto vecTy = dyn_cast(value.getType());
  if (!vecTy)
    return {value};

  ArrayRef srcShape = vecTy.getShape();
  if (!computeShapeRatio(srcShape, shape))
    return {value};

  int64_t srcShapeRank = srcShape.size();
  int64_t targetShapeRank = shape.size();

  // Pad `shape` on the left with 1s so it has the same rank as the source.
  SmallVector adjustedTargetShape(srcShape.size());
  int64_t rankDiff = srcShapeRank - targetShapeRank;
  std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
            1);
  llvm::copy(shape, adjustedTargetShape.begin() + rankDiff);

  SmallVector result;
  for (SmallVector offsets :
       StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
    SmallVector staticStrides(offsets.size(), 1);
    Value slice = vector::ExtractStridedSliceOp::create(
        builder, loc, value, offsets, adjustedTargetShape, staticStrides);

    // Reshape to remove leading unit dims if needed
    if (srcShapeRank > targetShapeRank) {
      auto targetTy = VectorType::get(shape, vecTy.getElementType());
      slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
    }
    result.push_back(slice);
  }

  return result;
}

// Assembles a vector of `shape` from `values`: starts from a zero constant
// and inserts each value with InsertStridedSliceOp at the offsets produced by
// StaticTileOffsetRange(shape, tileShape). `values[0]` is read
// unconditionally, so `values` must not be empty.
Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                             ValueRange values,
                                             ArrayRef shape) {
  VectorType inputTy = dyn_cast(values[0].getType());
  assert(llvm::all_of(values.getTypes(),
                      [&](Type type) { return type == inputTy; }) &&
         "values must be of the same VectorType");

  Type elemTy = inputTy.getElementType();
  ArrayRef tileShape = inputTy.getShape();

  VectorType resultTy = VectorType::get(shape, elemTy);
  auto zeroAttr = builder.getZeroAttr(elemTy);
  Value result = arith::ConstantOp::create(
      builder, loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));

  // zip_equal requires the number of values to match the number of tiles.
  for (auto [src, offsets] :
       llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
    SmallVector staticStrides(tileShape.size(), 1);
    result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
                                                  offsets, staticStrides);
  }
  return result;
}

// Runs three steps on `op`, in order:
//   1. a partial conversion with a local type converter mapping VectorType
//      to RankedTensorType;
//   2. two walks that attach layouts to the tensor types and fix up the
//      affected block-argument and result types;
//   3. a partial conversion driven by the caller-supplied `converter`.
// The results of both applyPartialConversion calls are discarded.
void xegpu::doSCFStructuralTypeConversionWithTensorType(
    Operation *op, TypeConverter converter) {
  MLIRContext *context = op->getContext();

  // Materializes a cast as a single-result UnrealizedConversionCastOp.
  auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
                            Location loc) -> Value {
    return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
        .getResult(0);
  };

  { // convert VectorType to RankedTensorType for SCF Structural ops
    // This local converter shadows the function parameter within this scope;
    // the parameter itself is only used in the last step.
    TypeConverter converter;
    converter.addConversion([](Type type) -> Type { return type; });
    converter.addConversion([](VectorType type) -> Type {
      return RankedTensorType::get(type.getShape(), type.getElementType());
    });
    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization(materializeCast);

    mlir::ConversionTarget target(*context);
    target.addLegalOp();

    mlir::RewritePatternSet patterns(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }

  { // propagate the layout attribute to RankedTensorType by checking
    // BuiltInUnrealizedCastOps
    // for VectorType to RankedTensorType cast.
    op->walk([](UnrealizedConversionCastOp castOp) {
      if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
        return WalkResult::skip();

      Value input = castOp.getInputs()[0];
      Value result = castOp.getResults()[0];
      auto inputTy = dyn_cast(input.getType());
      auto resultTy = dyn_cast(result.getType());

      // Only look at ops casting from VectorType to RankedTensorType
      if (!inputTy || !resultTy)
        return WalkResult::skip();

      xegpu::DistributeLayoutAttr layout =
          xegpu::getDistributeLayoutAttr(input);
      if (!layout)
        return WalkResult::skip();

      // Store the layout as the encoding of the result's type.
      RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
      result.setType(newTy);

      // update the arguments if user is a LoopLike op.
      for (OpOperand &use : result.getUses()) {
        if (auto loop = dyn_cast(use.getOwner())) {
          BlockArgument arg = loop.getTiedLoopRegionIterArg(&use);
          arg.setType(newTy);
        }
        // whileOp has two regions, the BlockArgument of the after region
        // is not exposed by LoopLikeOpInterface
        if (auto whileOp = dyn_cast(use.getOwner())) {
          unsigned idx = use.getOperandNumber();
          BlockArgument arg = whileOp.getAfterArguments()[idx];
          arg.setType(newTy);
        }
      }
      return WalkResult::advance();
    });

    // using yieldOp as anchor to update the result type of its ParentOp
    op->walk([](scf::YieldOp yieldOp) {
      Operation *parentOp = yieldOp->getParentOp();
      for (OpResult r : parentOp->getOpResults()) {
        unsigned idx = r.getResultNumber();
        Type resultTy = r.getType();
        Type yieldTy = yieldOp.getResults()[idx].getType();
        if (isa(resultTy) && yieldTy != resultTy)
          r.setType(yieldTy);
      }
    });
  }

  { // perform the conversion from RankedTensorType to VectorType based on the
    // DistributeLayoutAttr

    // Handle the UnrealizedConversionCastOp introduced by the first step.
    // For vector->RankedTensorType, it will simply forward the inputs.
    // For RankedTensorType->vector, it will update the inputs with the
    // one from the adaptor.
    class UnrealizedConversionCastOpPattern
        : public OpConversionPattern {
      using OpConversionPattern<
          mlir::UnrealizedConversionCastOp>::OpConversionPattern;

      mlir::LogicalResult
      matchAndRewrite(mlir::UnrealizedConversionCastOp op,
                      OneToNOpAdaptor adaptor,
                      ConversionPatternRewriter &rewriter) const override {
        auto inputs = op.getOperands();
        auto outputs = op.getOutputs();

        // Only single-input, single-output casts are handled.
        if (inputs.size() != 1 || outputs.size() != 1)
          return failure();

        auto inputTy = inputs[0].getType();
        auto outputTy = outputs[0].getType();

        if (isa(inputTy) && isa(outputTy)) {
          rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
          return success();
        }

        if (isa(inputTy) && isa(outputTy)) {
          SmallVector values = xegpu::flattenValues(adaptor.getInputs());
          auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
                                                          outputTy, values);
          rewriter.replaceOp(op, newOp);
          return success();
        }
        return failure();
      }
    };

    // From here on `converter` is the function parameter.
    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
                                           ValueRange inputs, Location loc) {
      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
          .getResults();
    });

    mlir::ConversionTarget target(*context);
    // A cast is legal only when none of its operand or result types satisfies
    // isTensorTy.
    target.addDynamicallyLegalOp(
        [](UnrealizedConversionCastOp op) {
          auto isTensorTy = [](Type type) {
            return isa(type);
          };
          return llvm::none_of(op->getOperandTypes(), isTensorTy) &&
                 llvm::none_of(op->getResultTypes(), isTensorTy);
        });
    mlir::RewritePatternSet patterns(context);
    patterns.insert(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }
}

// Returns the chip string of the first matching entry in the targets of the
// enclosing module op found via getParentOfType, or std::nullopt when there
// is no such parent, no targets, or no matching target attribute.
std::optional xegpu::getChipStr(Operation *op) {
  auto gpuModuleOp = op->getParentOfType();

  if (!gpuModuleOp)
    return std::nullopt;

  auto targetAttrs = gpuModuleOp.getTargets();
  if (targetAttrs) {
    for (auto &attr : *targetAttrs) {
      auto xevmAttr = llvm::dyn_cast(attr);
      if (xevmAttr)
        return xevmAttr.getChip().str();
    }
  }

  return std::nullopt;
}

/// Generates
element-wise addition ops of two arrays with same length. SmallVector xegpu::addElementwise(OpBuilder &builder, Location loc, ArrayRef lhs, ArrayRef rhs) { assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size"); SmallVector results; for (auto [l, r] : llvm::zip_equal(lhs, rhs)) { auto lval = getValueOrCreateConstantIndexOp(builder, loc, l); auto rval = getValueOrCreateConstantIndexOp(builder, loc, r); results.push_back(builder.createOrFold(loc, lval, rval)); } return results; } /// Generates element-wise addition ops of two arrays with automatic alignment. /// When the input arrays have different sizes, the shorter array is /// right-aligned with the longer array, and the unmatched leading elements from /// the longer array are preserved unchanged. This is commonly used for offset /// computation where higher-dimensional offsets need to be added to /// lower-dimensional adjustments. /// /// Example: /// lhs = [l1, l2, l3], rhs = [r1, r2] /// Result: [11, l2+r1, l3+r2] SmallVector xegpu::addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef lhs, ArrayRef rhs) { // ensure a is longer than b ArrayRef a = lhs.size() >= rhs.size() ? lhs : rhs; ArrayRef b = lhs.size() >= rhs.size() ? 
rhs : lhs; SmallVector results(a.take_front(a.size() - b.size())); a = a.slice(a.size() - b.size()); results.append(addElementwise(builder, loc, a, b)); return results; } template int xegpu::getLargestDivisor(T dim, ArrayRef candidates, ArrayRef candidateMultiples) { static_assert(std::is_integral::value, "T must be an integer type"); int largest = -1; SmallVector multiples = {1}; if (!candidateMultiples.empty()) multiples = SmallVector(candidateMultiples.begin(), candidateMultiples.end()); for (T candidate : candidates) { for (T multiple : multiples) { int value = static_cast(candidate * multiple); if (value != 0 && dim % value == 0 && value > largest) largest = value; } } return largest; } /// Explicit instantiations template int xegpu::getLargestDivisor(int dim, ArrayRef candidates, ArrayRef candidateMultiples); template int xegpu::getLargestDivisor(unsigned dim, ArrayRef candidates, ArrayRef candidateMultiples); bool xegpu::requirePacked(const xegpu::LayoutAttr layout) { if (!layout) return false; auto laneData = layout.getEffectiveLaneDataAsInt(); if (laneData.size() != 2) return false; return laneData[0] != 1; } bool xegpu::requireTranspose(const xegpu::LayoutAttr layout, const xegpu::uArch::uArch *uArch) { // Return false for unsupported targets. // TODO: Add more support or move to target info. if (uArch->getName().equals_insensitive("pvc") && uArch->getName().equals_insensitive("bmg")) return false; if (!layout) return false; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); if (laneLayout.size() != 2) return false; return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1; } // Check if dst shape is an expansion of src shape by inserting unit dimensions. // Returns true if all dimensions in src match corresponding dimensions in dst // (after skipping unit dimensions), and populates expandedUnitDims with the // indices of the unit dimensions in dst that were added (not present in src). 
// Example: src=[2,3], dst=[1,2,3,1] -> true, expandedUnitDims=[0,3] bool xegpu::matchUnitDimExpansion(ArrayRef src, ArrayRef dst, SmallVector &expandedUnitDims) { // All unit dimensions in dst that don't appear in src are the expanded // unit dimensions size_t srcIdx = 0; for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx]) srcIdx++; else if (dst[dstIdx] == 1) expandedUnitDims.push_back(dstIdx); else return false; return srcIdx == src.size(); } // Checks if dst shape is an expansion of src shape where each dimension in src // is split into one or more consecutive dimensions in dst whose product equals // the original dimension. Populates splitDimGroups with groups of dst indices // that correspond to each src dimension. Example: src=[6,4], dst=[2,3,2,2] -> // true bool xegpu::matchSplitDimExpansion( ArrayRef src, ArrayRef dst, SmallVector> &splitDimGroups) { // each dim in src can be mapped to one or more dims in dst whose product // equals to the src dim size_t srcIdx = 0; int64_t accumulatedSize = 1; SmallVector currentDstDims; splitDimGroups.clear(); for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) { if (srcIdx >= src.size()) return false; accumulatedSize *= dst[dstIdx]; currentDstDims.push_back(dstIdx); if (accumulatedSize == src[srcIdx]) { // Record the mapping: srcIdx -> currentDstDims splitDimGroups.push_back(currentDstDims); // move to next src dim srcIdx++; accumulatedSize = 1; currentDstDims.clear(); } else if (accumulatedSize > src[srcIdx]) { return false; } } return srcIdx == src.size(); }