llvm-project/mlir/test/Dialect/Affine/parallelize.mlir
Uday Bondhugula 5a99b776eb [MLIR] Extend isLoopMemoryParallel to account for locally allocated memrefs
Extend isLoopMemoryParallel check to include locally allocated memrefs.
This strengthens and also speeds up the dependence check used by the
utility by excluding locally allocated memrefs where appropriate.

Additional memref dialect ops can be supported exhaustively via proper
interfaces.

Reviewed By: dcaballe

Differential Revision: https://reviews.llvm.org/D120617
2022-03-04 09:16:28 +05:30

326 lines
11 KiB
MLIR

// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize | FileCheck %s
// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize='max-nested=1' | FileCheck --check-prefix=MAX-NESTED %s
// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize='parallel-reductions=1' | FileCheck --check-prefix=REDUCE %s
// Window-max reduction. The first nest only initializes the output buffer, so
// all four of its loops are expected to become parallel. In the second nest
// every iteration of the window loops reads and rewrites the same output
// element, so the two 3-iteration window loops are expected to stay
// sequential; the single-iteration loops around them are still converted.
// CHECK-LABEL: func @reduce_window_max() {
func @reduce_window_max() {
  %cst = arith.constant 0.000000e+00 : f32
  %out = memref.alloc() : memref<1x8x8x64xf32>
  %in = memref.alloc() : memref<1x18x18x64xf32>
  // Initialization nest: one store per output element.
  affine.for %arg0 = 0 to 1 {
    affine.for %arg1 = 0 to 8 {
      affine.for %arg2 = 0 to 8 {
        affine.for %arg3 = 0 to 64 {
          affine.store %cst, %out[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
        }
      }
    }
  }
  // Reduction nest: out[...] = max(out[...], in[window element]).
  affine.for %arg0 = 0 to 1 {
    affine.for %arg1 = 0 to 8 {
      affine.for %arg2 = 0 to 8 {
        affine.for %arg3 = 0 to 64 {
          affine.for %arg4 = 0 to 1 {
            affine.for %arg5 = 0 to 3 {
              affine.for %arg6 = 0 to 3 {
                affine.for %arg7 = 0 to 1 {
                  %cur = affine.load %out[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
                  %cand = affine.load %in[%arg0 + %arg4, %arg1 * 2 + %arg5, %arg2 * 2 + %arg6, %arg3 + %arg7] : memref<1x18x18x64xf32>
                  %gt = arith.cmpf ogt, %cur, %cand : f32
                  %max = arith.select %gt, %cur, %cand : f32
                  affine.store %max, %out[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
                }
              }
            }
          }
        }
      }
    }
  }
  return
}
// CHECK:   %[[cst:.*]] = arith.constant 0.000000e+00 : f32
// CHECK:   %[[v0:.*]] = memref.alloc() : memref<1x8x8x64xf32>
// CHECK:   %[[v1:.*]] = memref.alloc() : memref<1x18x18x64xf32>
// CHECK:   affine.parallel (%[[arg0:.*]]) = (0) to (1) {
// CHECK:     affine.parallel (%[[arg1:.*]]) = (0) to (8) {
// CHECK:       affine.parallel (%[[arg2:.*]]) = (0) to (8) {
// CHECK:         affine.parallel (%[[arg3:.*]]) = (0) to (64) {
// CHECK:           affine.store %[[cst]], %[[v0]][%[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]]] : memref<1x8x8x64xf32>
// CHECK:         }
// CHECK:       }
// CHECK:     }
// CHECK:   }
// CHECK:   affine.parallel (%[[a0:.*]]) = (0) to (1) {
// CHECK:     affine.parallel (%[[a1:.*]]) = (0) to (8) {
// CHECK:       affine.parallel (%[[a2:.*]]) = (0) to (8) {
// CHECK:         affine.parallel (%[[a3:.*]]) = (0) to (64) {
// CHECK:           affine.parallel (%[[a4:.*]]) = (0) to (1) {
// CHECK:             affine.for %[[a5:.*]] = 0 to 3 {
// CHECK:               affine.for %[[a6:.*]] = 0 to 3 {
// CHECK:                 affine.parallel (%[[a7:.*]]) = (0) to (1) {
// CHECK:                   %[[lhs:.*]] = affine.load %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32>
// CHECK:                   %[[rhs:.*]] = affine.load %[[v1]][%[[a0]] + %[[a4]], %[[a1]] * 2 + %[[a5]], %[[a2]] * 2 + %[[a6]], %[[a3]] + %[[a7]]] : memref<1x18x18x64xf32>
// CHECK:                   %[[res:.*]] = arith.cmpf ogt, %[[lhs]], %[[rhs]] : f32
// CHECK:                   %[[sel:.*]] = arith.select %[[res]], %[[lhs]], %[[rhs]] : f32
// CHECK:                   affine.store %[[sel]], %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32>
// CHECK:                 }
// CHECK:               }
// CHECK:             }
// CHECK:           }
// CHECK:         }
// CHECK:       }
// CHECK:     }
// CHECK:   }
// CHECK: }
// Three-deep nest with symbolic bounds. The %i and %j loops touch distinct
// elements of the result buffer and are expected to become parallel; the %k
// loop stores to the same [%i, %j] element on every iteration and is expected
// to remain a sequential affine.for.
//
// The label isolates this function's checks from the preceding one, and the
// function argument is captured instead of relying on the printer-generated
// SSA name.
// CHECK-LABEL: func @loop_nest_3d_outer_two_parallel
// CHECK-SAME: (%[[N:.*]]: index)
func @loop_nest_3d_outer_two_parallel(%N : index) {
  %0 = memref.alloc() : memref<1024 x 1024 x vector<64xf32>>
  %1 = memref.alloc() : memref<1024 x 1024 x vector<64xf32>>
  %2 = memref.alloc() : memref<1024 x 1024 x vector<64xf32>>
  affine.for %i = 0 to %N {
    affine.for %j = 0 to %N {
      %7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>>
      affine.for %k = 0 to %N {
        %5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>>
        %6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>>
        %8 = arith.mulf %5, %6 : vector<64xf32>
        %9 = arith.addf %7, %8 : vector<64xf32>
        affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>>
      }
    }
  }
  return
}
// CHECK:      affine.parallel (%[[I:.*]]) = (0) to (symbol(%[[N]])) {
// CHECK-NEXT:   affine.parallel (%[[J:.*]]) = (0) to (symbol(%[[N]])) {
// CHECK:          affine.for %[[K:.*]] = 0 to %[[N]] {
// A loop whose body holds an unregistered op is expected to be left as a
// sequential affine.for.
// CHECK-LABEL: unknown_op_conservative
func @unknown_op_conservative() {
  affine.for %iv = 0 to 10 {
    // CHECK: affine.for %[[arg1:.*]] = 0 to 10 {
    "unknown"() : () -> ()
  }
  return
}
// The body reads memory through memref.load rather than affine.load; the loop
// is expected to be kept as an affine.for.
// CHECK-LABEL: non_affine_load
func @non_affine_load() {
  %buf = memref.alloc() : memref<100 x f32>
  affine.for %iv = 0 to 100 {
    // CHECK: affine.for %{{.*}} = 0 to 100 {
    memref.load %buf[%iv] : memref<100 x f32>
  }
  return
}
// A loop whose lower bound is a max and whose upper bound is a min of two
// operands: the resulting affine.parallel is expected to keep both the max
// and the min form.
// CHECK-LABEL: for_with_minmax
func @for_with_minmax(%m: memref<?xf32>, %lb0: index, %lb1: index,
                      %ub0: index, %ub1: index) {
  // CHECK: affine.parallel (%{{.*}}) = (max(%{{.*}}, %{{.*}})) to (min(%{{.*}}, %{{.*}}))
  affine.for %i = max affine_map<(d0, d1) -> (d0, d1)>(%lb0, %lb1)
      to min affine_map<(d0, d1) -> (d0, d1)>(%ub0, %ub1) {
    affine.load %m[%i] : memref<?xf32>
  }
  return
}
// The inner loop's max lower bound uses the outer induction variable. After
// conversion the inner bound is expected to refer to the induction variable
// of the outer affine.parallel.
// CHECK-LABEL: nested_for_with_minmax
func @nested_for_with_minmax(%m: memref<?xf32>, %lb0: index,
                             %ub0: index, %ub1: index) {
  // CHECK: affine.parallel (%[[I:.*]]) =
  affine.for %j = 0 to 10 {
    // CHECK: affine.parallel (%{{.*}}) = (max(%{{.*}}, %[[I]])) to (min(%{{.*}}, %{{.*}}))
    affine.for %i = max affine_map<(d0, d1) -> (d0, d1)>(%lb0, %j)
        to min affine_map<(d0, d1) -> (d0, d1)>(%ub0, %ub1) {
      affine.load %m[%i] : memref<?xf32>
    }
  }
  return
}
// Exercised by the run that passes max-nested=1: only the outer loop of this
// two-deep nest is expected to be converted, the inner one stays an
// affine.for.
// MAX-NESTED-LABEL: @max_nested
func @max_nested(%m: memref<?x?xf32>, %lb0: index, %lb1: index,
                 %ub0: index, %ub1: index) {
  // MAX-NESTED: affine.parallel
  affine.for %i = affine_map<(d0) -> (d0)>(%lb0) to affine_map<(d0) -> (d0)>(%ub0) {
    // MAX-NESTED: affine.for
    affine.for %j = affine_map<(d0) -> (d0)>(%lb1) to affine_map<(d0) -> (d0)>(%ub1) {
      affine.load %m[%i, %j] : memref<?x?xf32>
    }
  }
  return
}
// Three-deep nest accumulating into a locally allocated buffer, exercised by
// the run that passes max-nested=1: the outermost loop is expected to become
// parallel and the two loops directly inside it are expected to remain
// affine.for.
// MAX-NESTED-LABEL: @max_nested_1
func @max_nested_1(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
  %acc = memref.alloc() : memref<4096x4096xf32>
  // MAX-NESTED: affine.parallel
  affine.for %arg3 = 0 to 4096 {
    // MAX-NESTED-NEXT: affine.for
    affine.for %arg4 = 0 to 4096 {
      // MAX-NESTED-NEXT: affine.for
      affine.for %arg5 = 0 to 4096 {
        %lhs = affine.load %arg0[%arg3, %arg5] : memref<4096x4096xf32>
        %rhs = affine.load %arg1[%arg5, %arg4] : memref<4096x4096xf32>
        %old = affine.load %acc[%arg3, %arg4] : memref<4096x4096xf32>
        %prod = arith.mulf %lhs, %rhs : f32
        %new = arith.addf %old, %prod : f32
        affine.store %new, %acc[%arg3, %arg4] : memref<4096x4096xf32>
      }
    }
  }
  return
}
// A loop carrying a floating-point sum through iter_args. In the default run
// no affine.parallel may appear. In the run with parallel-reductions=1 the
// loop is expected to become an affine.parallel with an "addf" reduction that
// yields the loaded value directly, with the initial value added after the
// loop.
// CHECK-LABEL: @iter_args
// REDUCE-LABEL: @iter_args
func @iter_args(%in: memref<10xf32>) {
  // REDUCE: %[[init:.*]] = arith.constant
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK-NOT: affine.parallel
  // REDUCE: %[[reduced:.*]] = affine.parallel (%{{.*}}) = (0) to (10) reduce ("addf")
  %final_red = affine.for %i = 0 to 10 iter_args(%red_iter = %cst) -> (f32) {
    // REDUCE: %[[red_value:.*]] = affine.load
    %elem = affine.load %in[%i] : memref<10xf32>
    // REDUCE-NOT: arith.addf
    %sum = arith.addf %red_iter, %elem : f32
    // REDUCE: affine.yield %[[red_value]]
    affine.yield %sum : f32
  }
  // REDUCE: arith.addf %[[init]], %[[reduced]]
  return
}
// Reduction loop nested in a plain loop. In the default run the outer loop is
// expected to be converted and nothing after it; in the run with
// parallel-reductions=1 an affine.parallel with an "addf" reduction is
// expected as well.
// CHECK-LABEL: @nested_iter_args
// REDUCE-LABEL: @nested_iter_args
func @nested_iter_args(%in: memref<20x10xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK: affine.parallel
  affine.for %i = 0 to 20 {
    // CHECK-NOT: affine.parallel
    // REDUCE: affine.parallel
    // REDUCE: reduce ("addf")
    %final_red = affine.for %j = 0 to 10 iter_args(%red_iter = %cst) -> (f32) {
      %elem = affine.load %in[%i, %j] : memref<20x10xf32>
      %sum = arith.addf %red_iter, %elem : f32
      affine.yield %sum : f32
    }
  }
  return
}
// Two iter args are summed together and the same sum is yielded for both of
// them. No affine.parallel is expected even with parallel reductions enabled.
// REDUCE-LABEL: @strange_butterfly
func @strange_butterfly() {
  %cst1 = arith.constant 0.0 : f32
  %cst2 = arith.constant 1.0 : f32
  // REDUCE-NOT: affine.parallel
  affine.for %i = 0 to 10 iter_args(%it1 = %cst1, %it2 = %cst2) -> (f32, f32) {
    %sum = arith.addf %it1, %it2 : f32
    affine.yield %sum, %sum : f32, f32
  }
  return
}
// The iter arg appears as both operands of the addition, i.e. it has more
// than one use. That is not a simple reduction, so the loop should not be
// parallelized.
// REDUCE-LABEL: @repeated_use
func @repeated_use() {
  %cst1 = arith.constant 0.0 : f32
  // REDUCE-NOT: affine.parallel
  affine.for %i = 0 to 10 iter_args(%it1 = %cst1) -> (f32) {
    %doubled = arith.addf %it1, %it1 : f32
    affine.yield %doubled : f32
  }
  return
}
// One iter arg feeds the chain of operations that produces the value being
// reduced into the other one. That is not a simple reduction, so the loop
// should not be parallelized.
// REDUCE-LABEL: @use_in_backward_slice
func @use_in_backward_slice() {
  %cst1 = arith.constant 0.0 : f32
  %cst2 = arith.constant 1.0 : f32
  // REDUCE-NOT: affine.parallel
  affine.for %i = 0 to 10 iter_args(%it1 = %cst1, %it2 = %cst2) -> (f32, f32) {
    %modified = "test.some_modification"(%it2) : (f32) -> f32
    %sum = arith.addf %it1, %modified : f32
    affine.yield %sum, %sum : f32, f32
  }
  return
}
// Nested loops where the inner bounds are a max / min over function
// arguments and the outer induction variable. The function arguments are
// captured from the signature so that the exact operands of the converted
// bounds can be verified.
// REDUCE-LABEL: @nested_min_max
// CHECK-LABEL: @nested_min_max
// CHECK: (%{{.*}}, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[UB1:.*]]: index)
func @nested_min_max(%m: memref<?xf32>, %lb0: index,
                     %ub0: index, %ub1: index) {
  // CHECK: affine.parallel (%[[J:.*]]) =
  affine.for %j = 0 to 10 {
    // CHECK: affine.parallel (%{{.*}}) = (max(%[[LB0]], %[[J]]))
    // CHECK: to (min(%[[UB0]], %[[UB1]]))
    affine.for %i = max affine_map<(d0, d1) -> (d0, d1)>(%lb0, %j)
        to min affine_map<(d0, d1) -> (d0, d1)>(%ub0, %ub1) {
      affine.load %m[%i] : memref<?xf32>
    }
  }
  return
}
// Test in the presence of locally allocated memrefs: both buffers are
// allocated inside the loop body and the only access is a store to a constant
// index of one of them. The loop is expected to be parallelized.
//
// A label directive with the full "()" signature is used so that this
// function's checks cannot be satisfied by output from another function whose
// name starts with the same text.
// CHECK-LABEL: func @local_alloc()
func @local_alloc() {
  %cst = arith.constant 0.0 : f32
  affine.for %i = 0 to 100 {
    %m = memref.alloc() : memref<1xf32>
    %ma = memref.alloca() : memref<1xf32>
    affine.store %cst, %m[0] : memref<1xf32>
  }
  // CHECK: affine.parallel
  return
}
// A buffer allocated inside the outer loop is written directly and through a
// memref.reinterpret_cast view. Expected result, in order: the outer loop is
// parallel; the loop storing to %m[%j] is parallel; the loop storing to %m[0]
// on every iteration stays an affine.for; the loop storing through the
// reinterpreted view at [%j, %j] is parallel.
//
// A label directive is used so that the loose "}" and "affine.for" patterns
// below cannot match output belonging to a different function.
// CHECK-LABEL: func @local_alloc_cast()
func @local_alloc_cast() {
  %cst = arith.constant 0.0 : f32
  affine.for %i = 0 to 100 {
    %m = memref.alloc() : memref<128xf32>
    affine.for %j = 0 to 128 {
      affine.store %cst, %m[%j] : memref<128xf32>
    }
    affine.for %j = 0 to 128 {
      affine.store %cst, %m[0] : memref<128xf32>
    }
    %r = memref.reinterpret_cast %m to offset: [0], sizes: [8, 16],
           strides: [16, 1] : memref<128xf32> to memref<8x16xf32>
    affine.for %j = 0 to 8 {
      affine.store %cst, %r[%j, %j] : memref<8x16xf32>
    }
  }
  // CHECK: affine.parallel
  // CHECK:   affine.parallel
  // CHECK:   }
  // CHECK:   affine.for
  // CHECK:   }
  // CHECK:   affine.parallel
  // CHECK:   }
  // CHECK: }
  return
}
// A loop that carries a memref value through iter_args and yields it
// unchanged.
// CHECK-LABEL: @iter_arg_memrefs
func @iter_arg_memrefs(%in: memref<10xf32>) {
  %init = memref.alloc() : memref<f32>
  // Loop-carried memrefs are treated as serializing the loop.
  // CHECK: affine.for
  %res = affine.for %i = 0 to 10 iter_args(%m_arg = %init) -> (memref<f32>) {
    affine.yield %m_arg : memref<f32>
  }
  return
}