Reapply "[AMDGPU] fold memref.subview/expand_shape/collapse_shape into amdgpu.gather_to_lds" (#150334)
This is a reapply of patch #149851. The reapply also fixes a CMake/Bazel build issue, which was the reason for the revert. (Thanks @rupprecht) Original patch (#149851) message: ----- This PR adds a new optimization pass to fold `memref.subview/expand_shape/collapse_shape` ops into consumer `amdgpu.gather_to_lds` operations. * Implements a new pass `AmdgpuFoldMemRefOpsPass` with pattern `FoldMemRefOpsIntoGatherToLDSOp` * Adds corresponding folding tests
This commit is contained in:
parent
baa19c05a3
commit
1c3e4e994b
@ -22,8 +22,9 @@ class ConversionTarget;
|
||||
namespace amdgpu {
|
||||
|
||||
#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
|
||||
#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
|
||||
#define GEN_PASS_DECL_AMDGPUFOLDMEMREFOPSPASS
|
||||
#define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
|
||||
#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
|
||||
#define GEN_PASS_REGISTRATION
|
||||
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
|
||||
|
||||
@ -38,6 +39,9 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
|
||||
void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns,
|
||||
PatternBenefit benefit = 1);
|
||||
|
||||
void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
|
||||
PatternBenefit benefit = 1);
|
||||
|
||||
} // namespace amdgpu
|
||||
} // namespace mlir
|
||||
|
||||
|
||||
@ -70,4 +70,16 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
|
||||
"memref::MemRefDialect"
|
||||
];
|
||||
}
|
||||
|
||||
def AmdgpuFoldMemRefOpsPass : Pass<"amdgpu-fold-memrefs-ops"> {
  let summary = "Fold memref operations into their parent operations";
  let description = [{
    This pass identifies memref operations (subview, expand_shape, collapse_shape)
    that are sources of `GatherToLDSOp` and attempts to fold the source ops,
    potentially simplifying the overall operation and improving performance.
  }];
  // The folding patterns rewrite the gather indices with affine ops
  // (affine.linearize_index, affine.delinearize_index and composed
  // affine.apply), so the affine dialect must be declared as a dependency
  // alongside memref to guarantee it is loaded before the pass runs.
  let dependentDialects = [
    "affine::AffineDialect",
    "memref::MemRefDialect"
  ];
}
|
||||
#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
|
||||
|
||||
@ -116,6 +116,43 @@ inline bool isSameViewOrTrivialAlias(MemrefValue a, MemrefValue b) {
|
||||
/// the source memref (i.e. implements ViewLikeOpInterface).
|
||||
MemrefValue skipViewLikeOps(MemrefValue source);
|
||||
|
||||
/// Given the 'indices' of a load/store operation where the memref is a result
|
||||
/// of an expand_shape op, returns the indices w.r.t. the source memref of the
|
||||
/// expand_shape op. For example
|
||||
///
|
||||
/// %0 = ... : memref<12x42xf32>
|
||||
/// %1 = memref.expand_shape %0 [[0, 1], [2]]
|
||||
/// : memref<12x42xf32> into memref<2x6x42xf32>
|
||||
/// %2 = load %1[%i1, %i2, %i3] : memref<2x6x42xf32>
|
||||
///
|
||||
/// could be folded into
|
||||
///
|
||||
/// %2 = load %0[6 * %i1 + %i2, %i3] :
|
||||
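/// memref<12x42xf32>
///
/// `startsInbounds` is forwarded as the `disjoint` flag of the
/// `affine.linearize_index` ops created for multi-dimensional groups.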
|
||||
LogicalResult resolveSourceIndicesExpandShape(
|
||||
Location loc, PatternRewriter &rewriter,
|
||||
memref::ExpandShapeOp expandShapeOp, ValueRange indices,
|
||||
SmallVectorImpl<Value> &sourceIndices, bool startsInbounds);
|
||||
|
||||
/// Given the 'indices' of a load/store operation where the memref is a result
|
||||
/// of a collapse_shape op, returns the indices w.r.t. the source memref of
|
||||
/// the collapse_shape op. For example
|
||||
///
|
||||
/// %0 = ... : memref<2x6x42xf32>
|
||||
/// %1 = memref.collapse_shape %0 [[0, 1], [2]]
|
||||
/// : memref<2x6x42xf32> into memref<12x42xf32>
|
||||
/// %2 = load %1[%i1, %i2] : memref<12x42xf32>
|
||||
///
|
||||
/// could be folded into
|
||||
///
|
||||
/// %2 = load %0[%i1 / 6, %i1 % 6, %i2] :
|
||||
/// memref<2x6x42xf32>
|
||||
LogicalResult
|
||||
resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
|
||||
memref::CollapseShapeOp collapseShapeOp,
|
||||
ValueRange indices,
|
||||
SmallVectorImpl<Value> &sourceIndices);
|
||||
|
||||
} // namespace memref
|
||||
} // namespace mlir
|
||||
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
add_mlir_dialect_library(MLIRAMDGPUTransforms
|
||||
EmulateAtomics.cpp
|
||||
ResolveStridedMetadata.cpp
|
||||
FoldMemRefsOps.cpp
|
||||
MaskedloadToLoad.cpp
|
||||
ResolveStridedMetadata.cpp
|
||||
|
||||
ADDITIONAL_HEADER_DIRS
|
||||
{$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
|
||||
@ -12,6 +13,7 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms
|
||||
LINK_LIBS PUBLIC
|
||||
MLIRAMDGPUDialect
|
||||
MLIRAMDGPUUtils
|
||||
MLIRAffineUtils
|
||||
MLIRArithDialect
|
||||
MLIRMemRefDialect
|
||||
MLIRSCFDialect
|
||||
|
||||
97
mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
Normal file
97
mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
Normal file
@ -0,0 +1,97 @@
|
||||
//===- FoldMemRefsOps.cpp - AMDGPU fold memref ops ------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
|
||||
|
||||
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
|
||||
#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
|
||||
#include "mlir/Transforms/WalkPatternRewriteDriver.h"
|
||||
#include "llvm/ADT/TypeSwitch.h"
|
||||
|
||||
namespace mlir::amdgpu {
|
||||
#define GEN_PASS_DEF_AMDGPUFOLDMEMREFOPSPASS
|
||||
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
|
||||
|
||||
struct AmdgpuFoldMemRefOpsPass final
|
||||
: amdgpu::impl::AmdgpuFoldMemRefOpsPassBase<AmdgpuFoldMemRefOpsPass> {
|
||||
void runOnOperation() override {
|
||||
RewritePatternSet patterns(&getContext());
|
||||
populateAmdgpuFoldMemRefOpsPatterns(patterns);
|
||||
walkAndApplyPatterns(getOperation(), std::move(patterns));
|
||||
}
|
||||
};
|
||||
|
||||
/// Folds the producer of a `GatherToLDSOp` source memref into the gather when
/// that producer is a `memref.subview`, `memref.expand_shape` or
/// `memref.collapse_shape`. The gather is recreated so that it reads from the
/// producer's own source memref, with the source indices rewritten to address
/// that memref; the destination, its indices and the transfer type are kept.
struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(GatherToLDSOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();

    // A source that is a block argument (e.g. a function argument) has no
    // defining op. Bail out here: a null pointer must never reach the
    // TypeSwitch below, which casts the value it is given.
    Operation *srcProducer = op.getSrc().getDefiningOp();
    if (!srcProducer) {
      return rewriter.notifyMatchFailure(
          op, "source is not defined by an operation");
    }

    // Filled in by whichever case matches the producer.
    Value memrefSource;
    SmallVector<Value> sourceIndices;
    auto foldResult =
        llvm::TypeSwitch<Operation *, LogicalResult>(srcProducer)
            .Case<memref::SubViewOp>([&](memref::SubViewOp subviewOp) {
              // If the source is a SubViewOp, we can directly rewrite the
              // GatherToLDSOp.
              mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
                  rewriter, loc, subviewOp.getMixedOffsets(),
                  subviewOp.getMixedStrides(), subviewOp.getDroppedDims(),
                  op.getSrcIndices(), sourceIndices);
              memrefSource = subviewOp.getSource();
              return success();
            })
            .Case<memref::ExpandShapeOp>(
                [&](memref::ExpandShapeOp expandShapeOp) {
                  if (failed(mlir::memref::resolveSourceIndicesExpandShape(
                          loc, rewriter, expandShapeOp, op.getSrcIndices(),
                          sourceIndices, false))) {
                    return failure();
                  }
                  memrefSource = expandShapeOp.getViewSource();
                  return success();
                })
            .Case<memref::CollapseShapeOp>(
                [&](memref::CollapseShapeOp collapseShapeOp) {
                  if (failed(mlir::memref::resolveSourceIndicesCollapseShape(
                          loc, rewriter, collapseShapeOp, op.getSrcIndices(),
                          sourceIndices))) {
                    return failure();
                  }
                  memrefSource = collapseShapeOp.getViewSource();
                  return success();
                })
            .Default([&](Operation *) {
              // If the source is not a SubViewOp, ExpandShapeOp, or
              // CollapseShapeOp, we cannot fold the GatherToLDSOp. The
              // failure is reported on the gather op being matched (the
              // parameter is deliberately unnamed so it cannot shadow `op`).
              return rewriter.notifyMatchFailure(
                  op,
                  "source producer is not one of SubViewOp, ExpandShapeOp, or "
                  "CollapseShapeOp");
            });

    if (failed(foldResult)) {
      return failure();
    }

    rewriter.replaceOpWithNewOp<GatherToLDSOp>(op, memrefSource, sourceIndices,
                                               op.getDst(), op.getDstIndices(),
                                               op.getTransferType());

    return success();
  }
};
|
||||
|
||||
/// Adds `FoldMemRefOpsIntoGatherToLDSOp` to `patterns`, registered with the
/// given `benefit`.
void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
                                         PatternBenefit benefit) {
  auto *context = patterns.getContext();
  patterns.add<FoldMemRefOpsIntoGatherToLDSOp>(context, benefit);
}
|
||||
} // namespace mlir::amdgpu
|
||||
@ -44,97 +44,6 @@ using namespace mlir;
|
||||
// Utility functions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// Given the 'indices' of a load/store operation where the memref is a result
|
||||
/// of a expand_shape op, returns the indices w.r.t to the source memref of the
|
||||
/// expand_shape op. For example
|
||||
///
|
||||
/// %0 = ... : memref<12x42xf32>
|
||||
/// %1 = memref.expand_shape %0 [[0, 1], [2]]
|
||||
/// : memref<12x42xf32> into memref<2x6x42xf32>
|
||||
/// %2 = load %1[%i1, %i2, %i3] : memref<2x6x42xf32
|
||||
///
|
||||
/// could be folded into
|
||||
///
|
||||
/// %2 = load %0[6 * i1 + i2, %i3] :
|
||||
/// memref<12x42xf32>
|
||||
static LogicalResult resolveSourceIndicesExpandShape(
|
||||
Location loc, PatternRewriter &rewriter,
|
||||
memref::ExpandShapeOp expandShapeOp, ValueRange indices,
|
||||
SmallVectorImpl<Value> &sourceIndices, bool startsInbounds) {
|
||||
SmallVector<OpFoldResult> destShape = expandShapeOp.getMixedOutputShape();
|
||||
|
||||
// Traverse all reassociation groups to determine the appropriate indices
|
||||
// corresponding to each one of them post op folding.
|
||||
for (ArrayRef<int64_t> group : expandShapeOp.getReassociationIndices()) {
|
||||
assert(!group.empty() && "association indices groups cannot be empty");
|
||||
int64_t groupSize = group.size();
|
||||
if (groupSize == 1) {
|
||||
sourceIndices.push_back(indices[group[0]]);
|
||||
continue;
|
||||
}
|
||||
SmallVector<OpFoldResult> groupBasis =
|
||||
llvm::map_to_vector(group, [&](int64_t d) { return destShape[d]; });
|
||||
SmallVector<Value> groupIndices =
|
||||
llvm::map_to_vector(group, [&](int64_t d) { return indices[d]; });
|
||||
Value collapsedIndex = rewriter.create<affine::AffineLinearizeIndexOp>(
|
||||
loc, groupIndices, groupBasis, /*disjoint=*/startsInbounds);
|
||||
sourceIndices.push_back(collapsedIndex);
|
||||
}
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Given the 'indices' of a load/store operation where the memref is a result
|
||||
/// of a collapse_shape op, returns the indices w.r.t to the source memref of
|
||||
/// the collapse_shape op. For example
|
||||
///
|
||||
/// %0 = ... : memref<2x6x42xf32>
|
||||
/// %1 = memref.collapse_shape %0 [[0, 1], [2]]
|
||||
/// : memref<2x6x42xf32> into memref<12x42xf32>
|
||||
/// %2 = load %1[%i1, %i2] : memref<12x42xf32>
|
||||
///
|
||||
/// could be folded into
|
||||
///
|
||||
/// %2 = load %0[%i1 / 6, %i1 % 6, %i2] :
|
||||
/// memref<2x6x42xf32>
|
||||
static LogicalResult
|
||||
resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
|
||||
memref::CollapseShapeOp collapseShapeOp,
|
||||
ValueRange indices,
|
||||
SmallVectorImpl<Value> &sourceIndices) {
|
||||
// Note: collapse_shape requires a strided memref, we can do this.
|
||||
auto metadata = rewriter.create<memref::ExtractStridedMetadataOp>(
|
||||
loc, collapseShapeOp.getSrc());
|
||||
SmallVector<OpFoldResult> sourceSizes = metadata.getConstifiedMixedSizes();
|
||||
for (auto [index, group] :
|
||||
llvm::zip(indices, collapseShapeOp.getReassociationIndices())) {
|
||||
assert(!group.empty() && "association indices groups cannot be empty");
|
||||
int64_t groupSize = group.size();
|
||||
|
||||
if (groupSize == 1) {
|
||||
sourceIndices.push_back(index);
|
||||
continue;
|
||||
}
|
||||
|
||||
SmallVector<OpFoldResult> basis =
|
||||
llvm::map_to_vector(group, [&](int64_t d) { return sourceSizes[d]; });
|
||||
auto delinearize = rewriter.create<affine::AffineDelinearizeIndexOp>(
|
||||
loc, index, basis, /*hasOuterBound=*/true);
|
||||
llvm::append_range(sourceIndices, delinearize.getResults());
|
||||
}
|
||||
if (collapseShapeOp.getReassociationIndices().empty()) {
|
||||
auto zeroAffineMap = rewriter.getConstantAffineMap(0);
|
||||
int64_t srcRank =
|
||||
cast<MemRefType>(collapseShapeOp.getViewSource().getType()).getRank();
|
||||
OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
|
||||
rewriter, loc, zeroAffineMap, ArrayRef<OpFoldResult>{});
|
||||
for (int64_t i = 0; i < srcRank; i++) {
|
||||
sourceIndices.push_back(
|
||||
getValueOrCreateConstantIndexOp(rewriter, loc, ofr));
|
||||
}
|
||||
}
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Helpers to access the memref operand for each op.
|
||||
template <typename LoadOrStoreOpTy>
|
||||
static Value getMemRefOperand(LoadOrStoreOpTy op) {
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
|
||||
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Arith/Utils/Utils.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Interfaces/ViewLikeInterface.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
@ -217,5 +218,70 @@ MemrefValue skipViewLikeOps(MemrefValue source) {
|
||||
return source;
|
||||
}
|
||||
|
||||
/// Maps `indices` into the result of `expandShapeOp` back to indices into its
/// source memref and appends them to `sourceIndices`, one per reassociation
/// group. Single-dimension groups forward their index unchanged; larger groups
/// are combined with an `affine.linearize_index` whose basis is the matching
/// slice of the expanded output shape and whose `disjoint` flag is
/// `startsInbounds`. Always returns success.
LogicalResult resolveSourceIndicesExpandShape(
    Location loc, PatternRewriter &rewriter,
    memref::ExpandShapeOp expandShapeOp, ValueRange indices,
    SmallVectorImpl<Value> &sourceIndices, bool startsInbounds) {
  SmallVector<OpFoldResult> outputShape = expandShapeOp.getMixedOutputShape();

  // Each reassociation group yields exactly one index on the source memref.
  for (ArrayRef<int64_t> dims : expandShapeOp.getReassociationIndices()) {
    assert(!dims.empty() && "association indices groups cannot be empty");

    if (dims.size() != 1) {
      // Gather the sizes and indices of the expanded dimensions in this group
      // and fold them into a single linear index.
      SmallVector<OpFoldResult> basis;
      SmallVector<Value> parts;
      for (int64_t d : dims) {
        basis.push_back(outputShape[d]);
        parts.push_back(indices[d]);
      }
      Value linearized = rewriter.create<affine::AffineLinearizeIndexOp>(
          loc, parts, basis, /*disjoint=*/startsInbounds);
      sourceIndices.push_back(linearized);
    } else {
      // The dimension was not expanded: reuse its index as is.
      sourceIndices.push_back(indices[dims.front()]);
    }
  }
  return success();
}
|
||||
|
||||
/// Maps `indices` into the result of `collapseShapeOp` back to indices into
/// its source memref and appends them to `sourceIndices`. Single-dimension
/// groups forward their index unchanged; larger groups are split with an
/// `affine.delinearize_index` over the source sizes of the group. When the op
/// has no reassociation groups, one zero index per source dimension is
/// appended instead. Always returns success.
LogicalResult
resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
                                  memref::CollapseShapeOp collapseShapeOp,
                                  ValueRange indices,
                                  SmallVectorImpl<Value> &sourceIndices) {
  // Note: collapse_shape requires a strided memref, we can do this.
  auto stridedMetadata = rewriter.create<memref::ExtractStridedMetadataOp>(
      loc, collapseShapeOp.getSrc());
  SmallVector<OpFoldResult> srcSizes =
      stridedMetadata.getConstifiedMixedSizes();

  auto reassociation = collapseShapeOp.getReassociationIndices();
  for (auto [collapsedIdx, dims] : llvm::zip(indices, reassociation)) {
    assert(!dims.empty() && "association indices groups cannot be empty");

    // A group with a single dimension was not collapsed: reuse the index.
    if (dims.size() == 1) {
      sourceIndices.push_back(collapsedIdx);
      continue;
    }

    // Split the collapsed index using the source sizes of this group.
    SmallVector<OpFoldResult> basis;
    for (int64_t d : dims) {
      basis.push_back(srcSizes[d]);
    }
    auto split = rewriter.create<affine::AffineDelinearizeIndexOp>(
        loc, collapsedIdx, basis, /*hasOuterBound=*/true);
    for (Value part : split.getResults()) {
      sourceIndices.push_back(part);
    }
  }

  // No reassociation groups at all: address every source dimension at zero.
  if (reassociation.empty()) {
    auto zeroMap = rewriter.getConstantAffineMap(0);
    int64_t srcRank =
        cast<MemRefType>(collapseShapeOp.getViewSource().getType()).getRank();
    OpFoldResult zero = affine::makeComposedFoldedAffineApply(
        rewriter, loc, zeroMap, ArrayRef<OpFoldResult>{});
    for (int64_t dim = 0; dim < srcRank; ++dim) {
      sourceIndices.push_back(
          getValueOrCreateConstantIndexOp(rewriter, loc, zero));
    }
  }
  return success();
}
|
||||
|
||||
} // namespace memref
|
||||
} // namespace mlir
|
||||
|
||||
94
mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
Normal file
94
mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
Normal file
@ -0,0 +1,94 @@
|
||||
// RUN: mlir-opt --amdgpu-fold-memrefs-ops --split-input-file %s | FileCheck %s
|
||||
|
||||
#gpu_lds_addrspace = 3
|
||||
|
||||
// CHECK: func @test_subview_folding
|
||||
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
|
||||
func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
|
||||
// CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
|
||||
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
|
||||
// CHECK: %[[C0:.*]] = arith.constant 0 : index
|
||||
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
|
||||
// CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
|
||||
|
||||
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
|
||||
%mem = memref.alloc() : memref<64x128xf16>
|
||||
%subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1]>>
|
||||
%c0 = arith.constant 0 : index
|
||||
amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
|
||||
: vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, #gpu_lds_addrspace>
|
||||
func.return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
#gpu_lds_addrspace = 3
|
||||
|
||||
// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
|
||||
// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
|
||||
|
||||
// CHECK: func @subview_folding_offset
|
||||
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
|
||||
func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
|
||||
// CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
|
||||
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
|
||||
// CHECK: %[[C0:.*]] = arith.constant 0 : index
|
||||
// CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
|
||||
// CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
|
||||
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
|
||||
// CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
|
||||
|
||||
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
|
||||
%mem = memref.alloc() : memref<64x128xf16>
|
||||
%subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1], offset: 4160>>
|
||||
%c0 = arith.constant 0 : index
|
||||
amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
|
||||
: vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, #gpu_lds_addrspace>
|
||||
func.return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
#gpu_lds_addrspace = 3
|
||||
|
||||
// CHECK: func @test_expand_shape
|
||||
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
|
||||
func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
|
||||
// CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
|
||||
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
|
||||
// CHECK: %[[C0:.*]] = arith.constant 0 : index
|
||||
// CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
|
||||
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]]
|
||||
// CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, 3>
|
||||
|
||||
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
|
||||
%mem = memref.alloc() : memref<8192xf16>
|
||||
%expand = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16>
|
||||
%c0 = arith.constant 0 : index
|
||||
amdgpu.gather_to_lds %expand[%offset_i, %offset_j], %alloc[%c0, %c0]
|
||||
: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu_lds_addrspace>
|
||||
func.return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
#gpu_lds_addrspace = 3
|
||||
|
||||
// CHECK: func @test_collapse_shape
|
||||
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
|
||||
func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
|
||||
// CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
|
||||
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
|
||||
// CHECK: %[[C0:.*]] = arith.constant 0 : index
|
||||
// CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
|
||||
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]]
|
||||
// CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
|
||||
|
||||
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
|
||||
%mem = memref.alloc() : memref<64x128xf16>
|
||||
%collapse = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16>
|
||||
%c0 = arith.constant 0 : index
|
||||
amdgpu.gather_to_lds %collapse[%offset_i], %alloc[%c0, %c0]
|
||||
: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, #gpu_lds_addrspace>
|
||||
func.return
|
||||
}
|
||||
@ -1657,6 +1657,7 @@ cc_library(
|
||||
":AMDGPUPassIncGen",
|
||||
":AMDGPUUtils",
|
||||
":AffineDialect",
|
||||
":AffineUtils",
|
||||
":ArithDialect",
|
||||
":ArithUtils",
|
||||
":ControlFlowDialect",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user