This reverts commit 747620db2a02b889ae3ba3921d6c0e526a3e7677. Multiple bot failures
parent 611f47c46c
commit b99e57583e
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+
 #ifndef MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
 #define MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
 
@@ -17,7 +18,9 @@ def XeGPUFoldAliasOps : Pass<"xegpu-fold-alias-ops"> {
     The pass folds aliasing ops into XeGPU ops that they operate on the original
     source references.
   }];
-  let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect"];
+  let dependentDialects = [
+    "memref::MemRefDialect", "xegpu::XeGPUDialect"
+  ];
 }
 
 def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
@@ -25,24 +28,14 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
   let description = [{
     The pass distributes subgroup level (SIMD) XeGPU ops to work items.
   }];
-  let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
-                           "vector::VectorDialect"];
-  let options = [Option<
-    "printOnly", "print-analysis-only", "bool",
-    /*default=*/"false",
-    "Print the result of the subgroup map propagation analysis and exit.">];
-}
-
-def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {
-  let summary = "Transform WorkGroup level XeGPU code to SubGroup level";
-  let description = [{
-    This transform pass distributes the workgroup level computation to
-    multiple subgroups based on the sg_layout and sg_data attributes.
-  }];
-
-  let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
-                           "vector::VectorDialect", "arith::ArithDialect",
-                           "gpu::GPUDialect", "index::IndexDialect"];
+  let dependentDialects = [
+    "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"
+  ];
+  let options = [
+    Option<"printOnly", "print-analysis-only", "bool",
+           /*default=*/"false",
+           "Print the result of the subgroup map propagation analysis and exit.">
+  ];
 }
 
 #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
@@ -62,7 +62,6 @@ void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
 
 /// Appends patterns for XeGPU SIMT distribution into `patterns`.
 void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
-void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
 
 /// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
 /// Users can control whether an operation to be unrolled or not, as well as
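Note: these populate* declarations are consumed by building a RewritePatternSet and handing it to a rewrite driver. For reference, the deleted pass below wires its patterns through the dialect-conversion driver roughly as follows (condensed from XeGPUWgToSgDistributePass::runOnOperation() further down in this diff; not a complete pass definition):

void runOnOperation() {
  MLIRContext *ctx = &getContext();
  RewritePatternSet patterns(ctx);
  ConversionTarget target(*ctx);
  // The real pass additionally marks xegpu nd ops whose tensor_desc still
  // carries an sg_layout as dynamically illegal; everything else stays legal.
  target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
  xegpu::populateXeGPUWgToSgDistributePatterns(patterns);
  if (failed(
          applyPartialConversion(getOperation(), target, std::move(patterns))))
    return signalPassFailure();
}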
@@ -2,7 +2,6 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUFoldAliasOps.cpp
   XeGPUSubgroupDistribute.cpp
   XeGPUUnroll.cpp
-  XeGPUWgToSgDistribute.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
@@ -1,378 +0,0 @@
-//===- XeGPUWgToSgDistribute.cpp - XeGPU Workgroup to Subgroup Pass -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
-
-#include "mlir/Dialect/Affine/Utils.h"
-#include "mlir/Dialect/Arith/Utils/Utils.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/Index/IR/IndexDialect.h"
-#include "mlir/Dialect/Index/IR/IndexOps.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Utils/IndexingUtils.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
-#include "mlir/Transforms/DialectConversion.h"
-
-namespace mlir {
-namespace xegpu {
-#define GEN_PASS_DEF_XEGPUWGTOSGDISTRIBUTE
-#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
-} // namespace xegpu
-} // namespace mlir
-
-using namespace mlir;
-
-namespace {
-
-/// This pattern transforms the CreateNdDescOp to create a subgroup descriptor
-/// from a workgroup descriptor. It replaces the offsets and sizes with
-/// appropriate values for the subgroup.
-/// It uses round-robin assignment to distribute the work to the subgroups.
-/// Following create_nd_desc operation:,
-///   %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x24xf32>
-///     -> !xegpu.tensor_desc<24x24xf32, #xegpu.layout<sg_layout = [4, 4],
-///         sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-/// is converted to 9 subgroup level operations based on the sg_layout &
-/// sg_data:
-///   %tdesc = xegpu.create_nd_tdesc %src[off1, off2] : memref<24x24xf32> ->
-///     !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2],
-///     lane_data = [1, 1]>>
-///
-/// The sg_layout and sg_data attributes are dropped after the pass as they are
-/// no longer needed.
-///
-/// 24x24 matrix distribution example:
-/// sg_layout = [4, 4], sg_data = [2, 2]
-/// Each 8x8 matrix within the 24x24 matrix is called a distribution unit.
-/// dist_unit_shape = [8, 8] --> sg_layout[i] * sg_data[i]
-///
-/// +------------------------+
-/// | 8x8 | 8x8 | 8x8 |  <- 3 tiles across
-/// |-----+-----+-----|
-/// | 8x8 | 8x8 | 8x8 |  <- 3 tiles down
-/// |-----+-----+-----|
-/// | 8x8 | 8x8 | 8x8 |
-/// +------------------------+
-///
-/// Each 8x8 tile is further subdivided among subgroups:
-/// +------------------------+
-/// | 2x2 2x2 2x2 2x2 |  <- 4 subgroups across (each handles 2 columns)
-/// | 2x2 2x2 2x2 2x2 |  <- 4 subgroups down (each handles 2 rows)
-/// | 2x2 2x2 2x2 2x2 |
-/// | 2x2 2x2 2x2 2x2 |
-/// +------------------------+
-///
-/// Since the 24x24 matrix is divided into 8x8 distribution units, there will be
-/// 9 distribution units (3x3) in total. Hence the 9 subgroup level operations.
-
-/// The pass currently has entire distribution logic in the WgToSgCreateNdOp
-/// pattern and all the other ops just follow.
-/// TODO: Decouple the distribution logic from WgToSgCreateNdOp for all the
-/// ops in the pass.
-struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
-  using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
-
-  // Calculate offset for each subgroup
-  SmallVector<OpFoldResult>
-  calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
-                         const SmallVector<OpFoldResult> &originalOffsets,
-                         const SmallVector<Value> &localOffset,
-                         const SmallVector<int64_t> &distUnitBaseAddr,
-                         const SmallVector<int64_t> &distUnitShape) const {
-    assert(localOffset.size() == distUnitBaseAddr.size() &&
-           "localOffset and distUnitBaseAddr must have the same rank");
-
-    SmallVector<OpFoldResult> globalOffsets(originalOffsets.begin(),
-                                            originalOffsets.end());
-    size_t rank = localOffset.size();
-    for (size_t i = 0; i < rank; ++i) {
-      size_t dimIdx = originalOffsets.size() - rank + i;
-      Value constOffset =
-          rewriter.create<arith::ConstantIndexOp>(loc, distUnitBaseAddr[i]);
-      Value offset =
-          rewriter.createOrFold<index::AddOp>(loc, localOffset[i], constOffset);
-      Value modValue =
-          rewriter.create<arith::ConstantIndexOp>(loc, distUnitShape[i]);
-      Value offsetMod =
-          rewriter.createOrFold<index::RemUOp>(loc, offset, modValue);
-      Value origOffset = getValueOrCreateConstantIndexOp(
-          rewriter, loc, originalOffsets[dimIdx]);
-      Value globalOffset =
-          rewriter.createOrFold<index::AddOp>(loc, origOffset, offsetMod);
-      globalOffsets[dimIdx] = globalOffset;
-    }
-
-    return globalOffsets;
-  }
-
-  LogicalResult
-  matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    MLIRContext *ctx = op.getContext();
-    xegpu::TensorDescType tdescTy = op.getType();
-    auto layout = dyn_cast<xegpu::LayoutAttr>(tdescTy.getLayout());
-    if (!layout)
-      return failure();
-    Type elemTy = tdescTy.getElementType();
-    ArrayRef<int64_t> wgShape = tdescTy.getShape();
-    // sgLayout must be present for workgroup-level distribution.
-    SmallVector<int64_t> sgLayout;
-    if (auto sgLayoutAttr = layout.getSgLayout())
-      sgLayout = llvm::to_vector_of<int64_t>(sgLayoutAttr.asArrayRef());
-    else
-      return rewriter.notifyMatchFailure(
-          op, "sgLayout attribute is required in layout");
-
-    SmallVector<int64_t> sgShape;
-    if (auto sgDataAttr = layout.getSgData()) {
-      sgShape = llvm::to_vector_of<int64_t>(sgDataAttr.asArrayRef());
-    } else {
-      assert(wgShape.size() == sgLayout.size() &&
-             "sgLayout and wgShape must have the same rank");
-      sgShape.reserve(wgShape.size());
-      for (size_t i = 0; i < wgShape.size(); ++i) {
-        assert(sgLayout[i] != 0 && "sgLayout elements must be non-zero");
-        sgShape.push_back(wgShape[i] / sgLayout[i]);
-      }
-    }
-
-    // TODO : Handle order attribute
-    // Get the subgroup ID
-    auto linearSgId =
-        rewriter.create<gpu::SubgroupIdOp>(loc, /*upper_bound=*/nullptr);
-
-    // Create constants for layout dimensions
-    SmallVector<Value> sgLayoutDim(sgLayout.size());
-    SmallVector<Value> sgDataDim(sgShape.size());
-
-    for (size_t i = 0; i < sgLayout.size(); i++) {
-      sgLayoutDim[i] =
-          rewriter.create<arith::ConstantIndexOp>(loc, sgLayout[i]);
-      sgDataDim[i] = rewriter.create<arith::ConstantIndexOp>(loc, sgShape[i]);
-    }
-
-    auto deLinearizeSgId =
-        affine::delinearizeIndex(rewriter, loc, linearSgId, sgLayoutDim);
-    if (failed(deLinearizeSgId))
-      return failure();
-    SmallVector<Value> sgIds = *deLinearizeSgId;
-
-    // Calculate distribution unit shape and local offsets for subgroup
-    SmallVector<int64_t> distUnitShape(sgLayout.size());
-    SmallVector<Value> localOffset(sgLayout.size());
-    for (size_t i = 0; i < sgLayout.size(); i++) {
-      distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], wgShape[i]);
-      localOffset[i] =
-          rewriter.createOrFold<index::MulOp>(loc, sgIds[i], sgDataDim[i]);
-    }
-
-    SmallVector<OpFoldResult> originalOffsets = op.getMixedOffsets();
-
-    xegpu::TensorDescType newTdescTy =
-        xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
-                                   layout.dropSgLayoutAndData());
-    SmallVector<Value> newCreateNdOps;
-    for (SmallVector<int64_t> distUnitBaseAddr :
-         StaticTileOffsetRange(wgShape, distUnitShape)) {
-      SmallVector<OpFoldResult> globalOffsets =
-          calculateGlobalOffsets(rewriter, loc, originalOffsets, localOffset,
-                                 distUnitBaseAddr, distUnitShape);
-
-      auto newCreateNdOp = rewriter.create<xegpu::CreateNdDescOp>(
-          loc, newTdescTy, op.getSource(), globalOffsets, op.getMixedSizes(),
-          op.getMixedStrides());
-      newCreateNdOps.push_back(newCreateNdOp);
-    }
-
-    rewriter.replaceOpWithMultiple(op, {newCreateNdOps});
-    return success();
-  }
-};
-
-/// This pattern transforms the LoadNdOp to load subgroup data.
-struct WgToSgLoadNdOp : public OpConversionPattern<xegpu::LoadNdOp> {
-  using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(xegpu::LoadNdOp op, OneToNOpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    SmallVector<Value> newLoadOps;
-    for (auto src : adaptor.getTensorDesc()) {
-      xegpu::TensorDescType tdescTy =
-          dyn_cast<xegpu::TensorDescType>(src.getType());
-      ArrayRef<int64_t> srcShape = tdescTy.getShape();
-      VectorType newResTy = VectorType::get(srcShape, tdescTy.getElementType());
-      auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(op.getLoc(), newResTy,
-                                                        src, op->getAttrs());
-      newLoadOps.push_back(newLoadOp);
-    }
-    rewriter.replaceOpWithMultiple(op, {newLoadOps});
-    return mlir::success();
-  }
-};
-
-/// This pattern transforms the StoreNdOp to store to a subgroup descriptor
-/// It creates a StoreNdOp op to store the updated values to the new subgroup
-/// src tensor descriptors.
-struct WgToSgStoreNdOp : public OpConversionPattern<xegpu::StoreNdOp> {
-  using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(xegpu::StoreNdOp op, OneToNOpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    for (auto [v, t] : llvm::zip(adaptor.getValue(), adaptor.getTensorDesc()))
-      rewriter.create<xegpu::StoreNdOp>(op.getLoc(), v, t, op.getL1HintAttr(),
-                                        op.getL2HintAttr(), op.getL3HintAttr());
-
-    rewriter.eraseOp(op);
-    return success();
-  }
-};
-
-/// This pattern transforms the UpdateNdOffsetOp to update the offsets of a
-/// subgroup descriptor. It creates an UpdateNdOffsetOp op to update the
-/// offsets of the new subgroup src tensor descriptors.
-struct WgToSgUpdateNdOffsetOp
-    : public OpConversionPattern<xegpu::UpdateNdOffsetOp> {
-  using OpConversionPattern<xegpu::UpdateNdOffsetOp>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(xegpu::UpdateNdOffsetOp op, OneToNOpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    llvm::SmallVector<Value> newUpdateTileOffsetOps;
-    for (auto tDesc : adaptor.getTensorDesc()) {
-      auto newUpdateTileOffsetOp = rewriter.create<xegpu::UpdateNdOffsetOp>(
-          op.getLoc(), tDesc.getType(), tDesc, op.getOffsets(),
-          op.getConstOffsets());
-      newUpdateTileOffsetOps.push_back(newUpdateTileOffsetOp);
-    }
-
-    rewriter.replaceOpWithMultiple(op, {newUpdateTileOffsetOps});
-    return success();
-  }
-};
-
-/// This pattern transforms the DpasOp to work at subgroup level.
-struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
-  using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(xegpu::DpasOp op, OneToNOpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    VectorType resultTy = op.getResult().getType();
-    if (resultTy.getRank() != 2)
-      return failure();
-
-    auto originalLayout =
-        llvm::dyn_cast_or_null<xegpu::LayoutAttr>(op->getAttr("layout"));
-    if (!originalLayout)
-      return failure();
-
-    SmallVector<Value> newDpasOps;
-    size_t i = 0;
-    for (auto aVec : adaptor.getLhs()) {
-      for (auto bVec : adaptor.getRhs()) {
-        llvm::SmallVector<Value> operands({aVec, bVec});
-        Value tmpC;
-        if (op.getAcc()) {
-          tmpC = adaptor.getAcc()[i++];
-          operands.push_back(tmpC);
-        }
-
-        ArrayRef<int64_t> aVecShape =
-            llvm::cast<VectorType>(aVec.getType()).getShape();
-        ArrayRef<int64_t> bVecShape =
-            llvm::cast<VectorType>(bVec.getType()).getShape();
-        VectorType resTy = VectorType::get({aVecShape[0], bVecShape[1]},
-                                           resultTy.getElementType());
-        tmpC = rewriter.create<xegpu::DpasOp>(
-            loc, resTy, operands,
-            llvm::ArrayRef<NamedAttribute>(
-                {"layout_result_0", originalLayout.dropSgLayoutAndData()}));
-        newDpasOps.push_back(tmpC);
-      }
-    }
-    rewriter.replaceOpWithMultiple(op, {newDpasOps});
-    return success();
-  }
-};
-
-/// This pattern transforms the PrefetchNdOp to prefetch the subgroup data.
-struct WgToSgPrefetchNdOp : public OpConversionPattern<xegpu::PrefetchNdOp> {
-  using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(xegpu::PrefetchNdOp op, OneToNOpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    for (auto src : adaptor.getTensorDesc())
-      rewriter.create<xegpu::PrefetchNdOp>(op.getLoc(), TypeRange(), src,
-                                           op->getAttrs());
-    rewriter.eraseOp(op);
-    return success();
-  }
-};
-
-} // namespace
-
-namespace mlir {
-namespace xegpu {
-void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
-  patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
-               WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp>(
-      patterns.getContext());
-}
-} // namespace xegpu
-} // namespace mlir
-
-namespace {
-struct XeGPUWgToSgDistributePass
-    : public xegpu::impl::XeGPUWgToSgDistributeBase<XeGPUWgToSgDistributePass> {
-  void runOnOperation() override;
-};
-} // namespace
-
-void XeGPUWgToSgDistributePass::runOnOperation() {
-  MLIRContext *ctx = &getContext();
-  RewritePatternSet patterns(ctx);
-  ConversionTarget target(*ctx);
-
-  auto getTensorDescType = [](Operation *op) -> xegpu::TensorDescType {
-    if (auto createOp = dyn_cast<xegpu::CreateNdDescOp>(op))
-      return createOp.getType();
-    if (auto loadOp = dyn_cast<xegpu::LoadNdOp>(op))
-      return loadOp.getTensorDescType();
-    if (auto storeOp = dyn_cast<xegpu::StoreNdOp>(op))
-      return storeOp.getTensorDescType();
-    if (auto updateOp = dyn_cast<xegpu::UpdateNdOffsetOp>(op))
-      return updateOp.getType();
-    if (auto prefetchOp = dyn_cast<xegpu::PrefetchNdOp>(op))
-      return prefetchOp.getTensorDescType();
-    return xegpu::TensorDescType();
-  };
-
-  auto isLegal = [&](xegpu::LayoutAttr layout) -> bool {
-    return !layout || layout.getSgLayout() == nullptr;
-  };
-
-  target.addDynamicallyLegalOp<xegpu::CreateNdDescOp, xegpu::LoadNdOp,
-                               xegpu::StoreNdOp, xegpu::UpdateNdOffsetOp,
-                               xegpu::PrefetchNdOp>([=](Operation *op) -> bool {
-    auto tdescTy = getTensorDescType(op);
-    auto layout = dyn_cast_or_null<xegpu::LayoutAttr>(tdescTy.getLayout());
-    return isLegal(layout);
-  });
-
-  target.addDynamicallyLegalOp<xegpu::DpasOp>([=](xegpu::DpasOp op) -> bool {
-    auto layout = dyn_cast_or_null<xegpu::LayoutAttr>(op->getAttr("layout"));
-    return isLegal(layout);
-  });
-
-  target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
-
-  xegpu::populateXeGPUWgToSgDistributePatterns(patterns);
-  if (failed(
-          applyPartialConversion(getOperation(), target, std::move(patterns))))
-    return signalPassFailure();
-}
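As a side note, the distribution arithmetic described in the WgToSgCreateNdOp comment of the deleted file can be checked with a few lines of plain C++. The sketch below is not MLIR or pass code; it only replays the documented 24x24 example (sg_layout = [4, 4], sg_data = [2, 2]) to show where the nine distribution units land and which tiles a single subgroup owns. The subgroup id 5 is an arbitrary value chosen for illustration.

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  const std::array<int64_t, 2> wgShape = {24, 24}; // workgroup-level shape
  const std::array<int64_t, 2> sgLayout = {4, 4};  // sg_layout
  const std::array<int64_t, 2> sgData = {2, 2};    // sg_data

  // dist_unit_shape[i] = sg_layout[i] * sg_data[i], clamped to the wg shape.
  std::array<int64_t, 2> distUnit;
  for (int i = 0; i < 2; ++i)
    distUnit[i] = std::min(sgLayout[i] * sgData[i], wgShape[i]);

  // The linear subgroup id is delinearized over sg_layout (row-major).
  const int64_t linearSgId = 5; // arbitrary example id
  const std::array<int64_t, 2> sgId = {linearSgId / sgLayout[1],
                                       linearSgId % sgLayout[1]};

  // Round robin: the subgroup owns one sg_data tile in every distribution unit.
  int64_t numUnits = 0;
  for (int64_t rowBase = 0; rowBase < wgShape[0]; rowBase += distUnit[0])
    for (int64_t colBase = 0; colBase < wgShape[1]; colBase += distUnit[1]) {
      int64_t row = rowBase + sgId[0] * sgData[0];
      int64_t col = colBase + sgId[1] * sgData[1];
      std::printf("subgroup %lld owns a 2x2 tile at [%lld, %lld]\n",
                  (long long)linearSgId, (long long)row, (long long)col);
      ++numUnits;
    }
  std::printf("distribution units: %lld\n", (long long)numUnits); // prints 9
  return 0;
}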
@@ -1,105 +0,0 @@
-// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
-
-gpu.module @test_round_robin_assignment {
-  // CHECK-LABEL: test_create_nd_tdesc
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
-    // CHECK-COUNT-12: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<24x32xf32>
-    // CHECK-SAME: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-NOT: xegpu.create_nd_tdesc
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_load_nd_tdesc
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) {
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-COUNT-12: xegpu.load_nd %{{.*}}
-    // CHECK-SAME-COUNT-12: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-SAME-COUNT-12: -> vector<2x2xf32>
-    // CHECK-NOT: xegpu.load_nd
-    %load = xegpu.load_nd %tdesc
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_store_nd
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_store_nd(%src: memref<24x32xf32>) {
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-COUNT-12: xegpu.store_nd %{{.*}}, %{{.*}}
-    // CHECK-SAME-COUNT-12: : vector<2x2xf32>, !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-NOT : xegpu.store_nd
-    %load = xegpu.load_nd %tdesc
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
-    xegpu.store_nd %load, %tdesc
-      : vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_update_nd
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_update_nd(%src: memref<24x32xf32>){
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-COUNT-12: xegpu.update_nd_offset %{{.*}}, [0, 16]
-    // CHECK-SAME-COUNT-12: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-NOT: xegpu.update_nd_offset
-    %update = xegpu.update_nd_offset %tdesc, [0, 16]
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_dpas
-  // CHECK-SAME: (%[[ARG_0:.*]]: memref<8x8xf32>, %[[ARG_1:.*]]: memref<8x8xf32>, %[[ARG_2:.*]]: memref<8x8xf32>)
-  gpu.func @test_dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) {
-    // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<8x8xf32>
-    // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-NOT: xegpu.create_nd_tdesc
-    // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<8x8xf32>
-    // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-NOT: xegpu.create_nd_tdesc
-    // CHECK-COUNT-4: xegpu.create_nd_tdesc %{{.*}}[%{{.*}}, %{{.*}}] : memref<8x8xf32>
-    // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-NOT: xegpu.create_nd_tdesc
-    // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}}
-    // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
-    // CHECK-SAME-COUNT-16: : vector<2x2xf32>, vector<2x2xf32> -> vector<2x2xf32>
-    // CHECK-NOT: xegpu.dpas
-    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<8x8xf32>
-      -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    %load_a = xegpu.load_nd %tdesc_a
-      : !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-      -> vector<8x8xf32>
-    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<8x8xf32>
-      -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    %load_b = xegpu.load_nd %tdesc_b
-      : !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-      -> vector<8x8xf32>
-    %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<8x8xf32>
-      -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    %dpas = xegpu.dpas %load_a, %load_b
-      {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
-      : vector<8x8xf32>, vector<8x8xf32> -> vector<8x8xf32>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_prefetch_nd_tdesc
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
-    // CHECK-COUNT-12: xegpu.prefetch_nd %{{.*}}
-    // CHECK-SAME-COUNT-12 : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
-    // CHECK-NOT: xegpu.prefetch_nd
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    xegpu.prefetch_nd %tdesc
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-    gpu.return
-  }
-}
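The CHECK-COUNT values in the deleted round-robin test above follow directly from the distribution arithmetic documented in the removed pass: a 24x32 workgroup shape with sg_layout = [4, 4] and sg_data = [2, 2] gives an 8x8 distribution unit, so there are (24/8) x (32/8) = 12 units and hence twelve subgroup-level ops per workgroup-level op. In the dpas case the 8x8 operands with sg_layout = [2, 2] and sg_data = [2, 2] give a 4x4 unit, so (8/4) x (8/4) = 4 tiles per operand, and because the WgToSgDpasOp pattern pairs every lhs tile with every rhs tile, 4 x 4 = 16 dpas ops are expected.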
@@ -1,172 +0,0 @@
-// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
-
-//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)>
-//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)>
-gpu.module @test_1_1_assignment {
-  // CHECK-LABEL: test_create_nd_tdesc
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
-    // CHECK: %[[SGID:.*]] = gpu.subgroup_id
-    // CHECK: %[[C12:.*]] = arith.constant 12 : index
-    // CHECK: %[[C4:.*]] = arith.constant 4 : index
-    // CHECK: %[[C8:.*]] = arith.constant 8 : index
-    // CHECK: %[[DIV:.*]] = affine.apply #map()[%[[SGID]]]
-    // CHECK: %[[REM:.*]] = affine.apply #map1()[%[[SGID]]]
-    // CHECK: %[[MUL1:.*]] = index.mul %[[DIV]], %[[C12]]
-    // CHECK: %[[MUL2:.*]] = index.mul %[[REM]], %[[C8]]
-    // CHECK: %[[C24:.*]] = arith.constant 24 : index
-    // CHECK: %[[MOD:.*]] = index.remu %[[MUL1]], %[[C24]]
-    // CHECK: %[[C0:.*]] = arith.constant 0 : index
-    // CHECK: %[[ADD1:.*]] = index.add %[[MOD]], %[[C0]]
-    // CHECK: %[[C32:.*]] = arith.constant 32 : index
-    // CHECK: %[[MOD1:.*]] = index.remu %[[MUL2]], %[[C32]]
-    // CHECK: %[[C0_1:.*]] = arith.constant 0 : index
-    // CHECK: %[[ADD2:.*]] = index.add %[[MOD1]], %[[C0_1]]
-    // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[ADD1]], %[[ADD2]]] : memref<24x32xf32>
-    // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK: gpu.return
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_load_nd_tdesc
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) {
-    // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
-    // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
-    // CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK-SAME: -> vector<12x8xf32>
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %load = xegpu.load_nd %tdesc
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_store_nd
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_store_nd(%src: memref<24x32xf32>) {
-    // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
-    // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
-    // CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK-SAME: -> vector<12x8xf32>
-    // CHECK: xegpu.store_nd %[[LOAD]], %[[TDESC]]
-    // CHECK-SAME: : vector<12x8xf32>, !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %load = xegpu.load_nd %tdesc
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
-    xegpu.store_nd %load, %tdesc
-      : vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_update_nd
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_update_nd(%src: memref<24x32xf32>){
-    // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
-    // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16]
-    // CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %update = xegpu.update_nd_offset %tdesc, [0, 16]
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_dpas
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  // CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
-  gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
-    // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
-    // CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
-    // CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK-SAME: -> vector<12x8xf32>
-    // CHECK: %[[TDESC_B:.*]] = xegpu.create_nd_tdesc %[[ARG_1]][{{%.*}}, {{%.*}}] : memref<32x24xf32>
-    // CHECK-SAME: -> !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
-    // CHECK: %[[LOAD_B:.*]] = xegpu.load_nd %[[TDESC_B]]
-    // CHECK-SAME: : !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
-    // CHECK-SAME: -> vector<8x12xf32>
-    // CHECK: %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]]
-    // CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
-    // CHECK-SAME: : vector<12x8xf32>, vector<8x12xf32> -> vector<12x12xf32>
-    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %load_a = xegpu.load_nd %tdesc_a
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
-    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<32x24xf32>
-      -> !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 12], lane_layout = [8, 2], lane_data = [1, 1]>>
-    %load_b = xegpu.load_nd %tdesc_b
-      : !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 12], lane_layout = [8, 2], lane_data = [1, 1]>>
-      -> vector<32x24xf32>
-    %dpas = xegpu.dpas %load_a, %load_b
-      {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
-      : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
-    gpu.return
-  }
-
-
-  // CHECK-LABEL: test_dpas_no_sg_data
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  // CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
-  gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
-    // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
-    // CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
-    // CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK-SAME: -> vector<12x8xf32>
-    // CHECK: %[[TDESC_B:.*]] = xegpu.create_nd_tdesc %[[ARG_1]][{{%.*}}, {{%.*}}] : memref<32x24xf32>
-    // CHECK-SAME: -> !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
-    // CHECK: %[[LOAD_B:.*]] = xegpu.load_nd %[[TDESC_B]]
-    // CHECK-SAME: : !xegpu.tensor_desc<8x12xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
-    // CHECK-SAME: -> vector<8x12xf32>
-    // CHECK: %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]]
-    // CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
-    // CHECK-SAME: : vector<12x8xf32>, vector<8x12xf32> -> vector<12x12xf32>
-    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], lane_layout = [2, 8], lane_data = [1, 1]>>
-    %load_a = xegpu.load_nd %tdesc_a
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], lane_layout = [2, 8], lane_data = [1, 1]>>
-      -> vector<24x32xf32>
-    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<32x24xf32>
-      -> !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], lane_layout = [8, 2], lane_data = [1, 1]>>
-    %load_b = xegpu.load_nd %tdesc_b
-      : !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], lane_layout = [8, 2], lane_data = [1, 1]>>
-      -> vector<32x24xf32>
-    %dpas = xegpu.dpas %load_a, %load_b
-      {layout = #xegpu.layout<sg_layout = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
-      : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_prefetch_nd_tdesc
-  // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
-    // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
-    // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    // CHECK: xegpu.prefetch_nd %[[TDESC]]
-    // CHECK-SAME: : !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
-    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
-      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    xegpu.prefetch_nd %tdesc
-      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-    gpu.return
-  }
-
-  // CHECK-LABEL: test_dpas_with_no_create_nd_desc
-  gpu.func @test_dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) {
-    // CHECK-NOT: vector<12x12xf32>
-    %dpas = xegpu.dpas %a, %b
-      {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
-      : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
-    gpu.return
-  }
-}
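In this second deleted test the assignment is one-to-one rather than round-robin: with sg_layout = [2, 4] and sg_data = [12, 8] over a 24x32 shape, the distribution unit equals the whole workgroup shape, so each subgroup receives exactly one 12x8 tile. The affine maps checked at the top, #map = (s0 floordiv 4) and #map1 = (s0 mod 4), are simply the delinearization of the linear subgroup id over the 2x4 sg_layout, and the index.mul by 12 and 8 turn that delinearized id into the subgroup's row and column offsets.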