diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 6d21aa929571..e8d1fbf6bf40 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -1539,9 +1539,7 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou }]; - let hasFolder = 1; let hasVerifier = 1; - let hasCanonicalizer = 1; } class SizeInBits : diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 3aba0f507076..e470d1f820f7 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -1113,29 +1113,6 @@ LogicalResult ConvertLayoutOp::verify() { return mlir::success(); } -OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) { - if (getInputLayout() == getTargetLayout()) - return getSource(); - return {}; -} - -struct FoldConvertLayoutOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, - PatternRewriter &rewriter) const override { - if (op.getInputLayout() == op.getTargetLayout()) { - rewriter.replaceOp(op, op.getSource()); - return success(); - } - return failure(); - } -}; - -void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns, - MLIRContext *context) { - patterns.add(context); -} - //===----------------------------------------------------------------------===// // XeGPU_LoadMatrixOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 206f52a6c71c..b815950361b0 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -77,30 +77,6 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { } } -// This pattern lowers ConvertLayoutOp by 
removing the inst_data field from the -// layout attributes. Since both producer and consumer operations handle data -// partitioning based on their own inst_data, while maintaining original input -// and output shape, ConvertLayoutOp does not need to manage inst_data. -struct ConvertLayoutOpPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, - PatternRewriter &rewriter) const override { - xegpu::DistributeLayoutAttr inputLayout = op.getInputLayoutAttr(); - xegpu::DistributeLayoutAttr targetLayout = op.getTargetLayoutAttr(); - if (inputLayout.getEffectiveInstDataAsInt().empty() || - targetLayout.getEffectiveInstDataAsInt().empty()) - return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp."); - - inputLayout = inputLayout.dropInstData(); - targetLayout = targetLayout.dropInstData(); - auto newOp = rewriter.createOrFold( - op.getLoc(), op.getType(), op.getSource(), inputLayout, targetLayout); - rewriter.replaceOp(op, newOp); - return success(); - } -}; - //===------------------------------------------------------------------------===// // The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops // to partition operations that process large shapes into multiple operations on @@ -177,6 +153,18 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { return getTileShape(loadGatherOp->getOpOperand(0)); } + if (auto convertLayoutOp = dyn_cast(op)) { + auto inputInstData = + convertLayoutOp.getInputLayout().getEffectiveInstDataAsInt(); + auto targetInstData = + convertLayoutOp.getTargetLayout().getEffectiveInstDataAsInt(); + // return the one with larger size + if (computeProduct(inputInstData) >= computeProduct(targetInstData)) + return inputInstData; + else + return targetInstData; + } + if (auto storeScatterOp = dyn_cast(op)) return getTileShape(storeScatterOp.getOffsets() ? 
storeScatterOp->getOpOperand(0) @@ -260,7 +248,16 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const { std::optional> tileShape = getTileShape(result); return tileShape.has_value() && isUnrollable(result, *tileShape); }); - return hasUnrollableOperands || hasUnrollableResults; + // ConvertLayoutOp must be processed to drop the inst_data in the layout + bool isConvertLayoutWithInstData = false; + if (auto convertLayoutOp = dyn_cast(op)) { + auto targettLayout = convertLayoutOp.getTargetLayout(); + if (targettLayout && !targettLayout.getEffectiveInstDataAsInt().empty()) { + isConvertLayoutWithInstData = true; + } + } + return hasUnrollableOperands || hasUnrollableResults || + isConvertLayoutWithInstData; } void XeGPUBlockingPass::runOnOperation() { @@ -378,8 +375,6 @@ void XeGPUBlockingPass::runOnOperation() { }); RewritePatternSet patterns(ctx); - patterns.add(ctx); - vector::UnrollVectorOptions vectorOptions; vectorOptions.setNativeShapeFn(options.nativeShape); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index f05036deabe4..bf9fded8a3ab 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -2060,6 +2060,27 @@ struct VectorStepSliceDistribution final : public gpu::WarpDistributionPattern { } }; +struct ConvertLayoutDistribution + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, + PatternRewriter &rewriter) const override { + auto inputLayout = op.getInputLayoutAttr(); + auto targetLayout = op.getTargetLayoutAttr(); + + if (!inputLayout || !targetLayout) + return rewriter.notifyMatchFailure(op, "missing layout attributes"); + + if (!inputLayout.isCompatibleWith(targetLayout, xegpu::LayoutKind::Lane)) { + return rewriter.notifyMatchFailure( + op, "lowering incompatible convert_layout not yet 
supported"); + } + rewriter.replaceOp(op, op.getSource()); + return success(); + } +}; + } // namespace namespace { @@ -2077,7 +2098,7 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( GpuBarrierDistribution, VectorMultiReductionDistribution, LoadDistribution, StoreDistribution, VectorTransposeDistribution, VectorBitcastDistribution, LoadMatrixDistribution, - StoreMatrixDistribution, + StoreMatrixDistribution, ConvertLayoutDistribution, MemrefExtractAlignedPointerAsIndexDistribution>( patterns.getContext(), /*pattern benefit=*/PatternHierarchy::Regular); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index 2b1bd4d73a57..d633c1531955 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -1032,15 +1032,64 @@ struct UnrollStoreMatrixOp : public UnrollPattern { } }; +/// UnrollConvertLayoutOp pattern for unrolling xegpu::ConvertLayoutOp +/// operations. It first checks whether the convert layout op has valid layouts +/// after inst_data is stripped. If it does, it will unroll the vector into +/// multiple smaller vectors according to the target shape, and create multiple +/// ConvertLayoutOps with the unrolled vectors and the stripped layouts. 
+struct UnrollConvertLayoutOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + VectorType valueTy = llvm::dyn_cast(op.getType()); + assert(valueTy && "the value type must be vector type!"); + + std::optional> targetShape = getTargetShape(op); + if (!targetShape || targetShape->size() != (size_t)valueTy.getRank()) + return failure(); + + xegpu::DistributeLayoutAttr inputLayout = op.getInputLayoutAttr(); + xegpu::DistributeLayoutAttr targetLayout = op.getTargetLayoutAttr(); + if (!inputLayout || !targetLayout) + return rewriter.notifyMatchFailure(op, "missing layout attributes."); + + if (inputLayout.getEffectiveInstDataAsInt().empty() || + targetLayout.getEffectiveInstDataAsInt().empty()) + return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp."); + + inputLayout = inputLayout.dropInstData(); + targetLayout = targetLayout.dropInstData(); + + Value newSource = op.getSource(); + SmallVector newOps; + if (inputLayout && targetLayout) { + SmallVector convertedValTypes = + getUnrolledTypes(valueTy, *targetShape); + SmallVector convertedValues = + pack(op.getOperand(), convertedValTypes, *targetShape, loc, rewriter); + for (auto [v, t] : llvm::zip(convertedValues, convertedValTypes)) { + auto newOp = xegpu::ConvertLayoutOp::create(rewriter, loc, t, v, + inputLayout, targetLayout); + newOps.push_back(newOp); + } + newSource = unpack(newOps, op.getType(), *targetShape, loc, rewriter); + } + + rewriter.replaceOp(op, newSource); + return success(); + } +}; + } // namespace void mlir::xegpu::populateXeGPUUnrollPatterns( RewritePatternSet &patterns, const xegpu::UnrollOptions &options) { - patterns - .add( - patterns.getContext(), options); + patterns.add( + patterns.getContext(), options); } diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir 
index 31bb6704eece..dde58ba31860 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -1189,4 +1189,20 @@ gpu.func gpu.return } + // CHECK-LABEL: gpu.func @convert_layout_removed_when_compatible( + // CHECK: %[[R:.*]] = gpu.warp_execute_on_lane_0 + // CHECK-NOT: xegpu.convert_layout + // CHECK: gpu.yield %{{.*}} : vector<16xf32> + gpu.func @convert_layout_removed_when_compatible(%laneid: index){ + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) { + %0 = "some_op"() : () -> vector<16xf32> + %1 = xegpu.convert_layout %0 + <{input_layout = #xegpu.layout, + target_layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> + : vector<16xf32> + gpu.yield %1 : vector<16xf32> + } + "some_user_op"(%r) : (vector<1xf32>) -> () + gpu.return + } } diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index e80a9144b967..af8615740fde 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -592,8 +592,8 @@ gpu.module @test_kernel { %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> %a = xegpu.load_nd %a_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> %b = xegpu.load_nd %b_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> - %e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16> - %c = xegpu.dpas %e, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + %a1 = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16> + %c = xegpu.dpas %a1, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> 
!xegpu.tensor_desc<16x16xf32, #c> xegpu.store_nd %c, %c_tdesc {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c> gpu.return @@ -602,19 +602,53 @@ gpu.module @test_kernel { // ----- -#lb = #xegpu.layout -#b = #xegpu.layout +#in = #xegpu.slice<#xegpu.layout, dims = [1]> +#out = #xegpu.slice<#xegpu.layout, dims = [1]> +gpu.module @test_kernel { + // CHECK-LABEL: gpu.func @convert_layout_drop_inst_data_to_null + // CHECK-NOT: xegpu.convert_layout + gpu.func @convert_layout_drop_inst_data_to_null(%arg0: vector<2xf32>) -> vector<2xf32> { + %0 = xegpu.convert_layout %arg0 <{input_layout = #in, target_layout = #out}> : vector<2xf32> + gpu.return %0 : vector<2xf32> + } +} + +// ----- + +gpu.module @test_kernel { + // CHECK-LABEL: gpu.func @convert_layout_drop_slice_inst_data_to_null + // CHECK-NOT: xegpu.convert_layout + gpu.func @convert_layout_drop_slice_inst_data_to_null(%arg0: vector<1xf32>) -> vector<1xf32> { + %0 = xegpu.convert_layout %arg0 <{input_layout = #xegpu.layout, target_layout = #xegpu.slice<#xegpu.layout, dims = [1, 2]>}> : vector<1xf32> + gpu.return %0 : vector<1xf32> + } +} + +// ----- + +#lb = #xegpu.layout +#b = #xegpu.layout gpu.module @test_kernel { //CHECK: gpu.func @convert_layout([[arg0:%.+]]: vector<8x32x2xf16>) -> vector<8x32x2xf16> { //CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<8x32x2xf16> - //CHECK: [[e1:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 0, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16> - //CHECK: [[m1:%.+]] = math.exp [[e1]] {layout_result_0 = #xegpu.layout} : vector<8x16x2xf16> - //CHECK: [[r1:%.+]] = vector.insert_strided_slice [[m1]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16> - //CHECK: [[e2:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 16, 0], sizes = [8, 16, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<8x16x2xf16> - //CHECK: [[m2:%.+]] = math.exp [[e2]] 
{layout_result_0 = #xegpu.layout} : vector<8x16x2xf16> - //CHECK: [[r2:%.+]] = vector.insert_strided_slice [[m2]], [[r1]] {offsets = [0, 16, 0], strides = [1, 1, 1]} : vector<8x16x2xf16> into vector<8x32x2xf16> - //CHECK: gpu.return [[r2]] : vector<8x32x2xf16> + //CHECK: [[e0:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [0, 0, 0], sizes = [4, 32, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<4x32x2xf16> + //CHECK: [[e1:%.+]] = vector.extract_strided_slice [[arg0]] {offsets = [4, 0, 0], sizes = [4, 32, 2], strides = [1, 1, 1]} : vector<8x32x2xf16> to vector<4x32x2xf16> + //CHECK: [[c0:%.+]] = xegpu.convert_layout [[e0]] <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<4x32x2xf16> + //CHECK: [[c1:%.+]] = xegpu.convert_layout [[e1]] <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<4x32x2xf16> + //CHECK: [[e2:%.+]] = vector.extract_strided_slice [[c0]] {offsets = [0, 0, 0], sizes = [4, 16, 2], strides = [1, 1, 1]} : vector<4x32x2xf16> to vector<4x16x2xf16> + //CHECK: [[m0:%.+]] = math.exp [[e2]] {layout_result_0 = #xegpu.layout} : vector<4x16x2xf16> + //CHECK: [[i0:%.+]] = vector.insert_strided_slice [[m0]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x16x2xf16> into vector<8x32x2xf16> + //CHECK: [[e3:%.+]] = vector.extract_strided_slice [[c0]] {offsets = [0, 16, 0], sizes = [4, 16, 2], strides = [1, 1, 1]} : vector<4x32x2xf16> to vector<4x16x2xf16> + //CHECK: [[m1:%.+]] = math.exp [[e3]] {layout_result_0 = #xegpu.layout} : vector<4x16x2xf16> + //CHECK: [[i1:%.+]] = vector.insert_strided_slice [[m1]], [[i0]] {offsets = [0, 16, 0], strides = [1, 1, 1]} : vector<4x16x2xf16> into vector<8x32x2xf16> + //CHECK: [[e4:%.+]] = vector.extract_strided_slice [[c1]] {offsets = [0, 0, 0], sizes = [4, 16, 2], strides = [1, 1, 1]} : vector<4x32x2xf16> to vector<4x16x2xf16> + //CHECK: [[m2:%.+]] = math.exp [[e4]] {layout_result_0 = #xegpu.layout} : vector<4x16x2xf16> + //CHECK: [[i2:%.+]] = 
vector.insert_strided_slice [[m2]], [[i1]] {offsets = [4, 0, 0], strides = [1, 1, 1]} : vector<4x16x2xf16> into vector<8x32x2xf16> + //CHECK: [[e5:%.+]] = vector.extract_strided_slice [[c1]] {offsets = [0, 16, 0], sizes = [4, 16, 2], strides = [1, 1, 1]} : vector<4x32x2xf16> to vector<4x16x2xf16> + //CHECK: [[m3:%.+]] = math.exp [[e5]] {layout_result_0 = #xegpu.layout} : vector<4x16x2xf16> + //CHECK: [[i3:%.+]] = vector.insert_strided_slice [[m3]], [[i2]] {offsets = [4, 16, 0], strides = [1, 1, 1]} : vector<4x16x2xf16> into vector<8x32x2xf16> + //CHECK: gpu.return [[i3]] : vector<8x32x2xf16> gpu.func @convert_layout(%B: vector<8x32x2xf16>) -> vector<8x32x2xf16> { %b = xegpu.convert_layout %B <{input_layout = #lb, target_layout = #b}> : vector<8x32x2xf16>