From 15e7177f088eeb5aa7be5d30632e7b5e815fa8aa Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Sun, 8 Mar 2026 18:46:54 -0400 Subject: [PATCH] [mlir][GPU] Fix double spaces in tests after ODS printer fix. NFC. (#185325) Follow-up to #184253. The ODS attr/type printer fix removed the leading space from generated print() methods. Update tests that checked for the old double-space output of GPU ops using GPU_DimensionAttr and GPU_MmaElementwiseOpAttr. Co-authored-by: Claude Opus 4.6 --- .../VectorToGPU/vector-to-mma-ops.mlir | 12 +-- mlir/test/Dialect/Affine/ops.mlir | 8 +- .../GPU/subgroup-mma-vector-unroll.mlir | 8 +- mlir/test/Dialect/GPU/subgroupId-rewrite.mlir | 10 +-- mlir/test/Dialect/GPU/transform-gpu.mlir | 76 +++++++++---------- .../Dialect/SparseTensor/GPU/gpu_matmul.mlir | 8 +- .../Dialect/SparseTensor/GPU/gpu_matvec.mlir | 8 +- .../Vector/vector-warp-distribute.mlir | 4 +- .../GPU/CUDA/sm90/cga_cluster.mlir | 18 ++--- .../sm90/gemm_f32_f16_f16_128x128x128.mlir | 2 +- .../gemm_pred_f32_f16_f16_128x128x128.mlir | 2 +- .../tma_load_128x128_stride_noswizzle.mlir | 14 ++-- .../sm90/tma_load_128x64_swizzle128b.mlir | 4 +- .../CUDA/sm90/tma_load_64x64_swizzle128b.mlir | 4 +- .../sm90/tma_load_64x8_8x128_noswizzle.mlir | 4 +- ...a_load_64x8_8x128_noswizzle-transform.mlir | 2 +- .../GPU/LevelZero/gpu-addf32-to-spirv.mlir | 6 +- .../GPU/LevelZero/gpu-addi64-to-spirv.mlir | 4 +- .../LevelZero/gpu-memcpy-addf32-to-spirv.mlir | 6 +- .../GPU/LevelZero/gpu-reluf32-to-spirv.mlir | 4 +- .../GPU/SYCL/gpu-addf32-to-spirv.mlir | 6 +- .../GPU/SYCL/gpu-addi64-to-spirv.mlir | 4 +- .../GPU/SYCL/gpu-memcpy-addf32-to-spirv.mlir | 6 +- .../GPU/SYCL/gpu-reluf32-to-spirv.mlir | 4 +- mlir/test/python/dialects/gpu/dialect.py | 8 +- 25 files changed, 116 insertions(+), 116 deletions(-) diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir index 32065035b6f2..5b316f3e9e21 100644 --- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir +++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir @@ -463,7 +463,7 @@ func.func @matmul_mixed_signedness_int8(%arg0: memref<16x32xi8>, %arg1: memref<1 // CHECK-LABEL: func @cast_f16_to_f32_write // CHECK: %[[COMPUTE:.+]] = gpu.subgroup_mma_compute -// CHECK: %[[EXT:.+]] = gpu.subgroup_mma_elementwise extf %[[COMPUTE]] : (!gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> +// CHECK: %[[EXT:.+]] = gpu.subgroup_mma_elementwise extf %[[COMPUTE]] : (!gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> // CHECK: gpu.subgroup_mma_store_matrix %[[EXT]] func.func @cast_f16_to_f32_write(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<16x16xf16>, %arg3: memref<16x16xf32>) { %c0 = arith.constant 0 : index @@ -485,7 +485,7 @@ func.func @cast_f16_to_f32_write(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf // CHECK-LABEL: func @cast_f32_to_f16_write // CHECK: %[[COMPUTE:.+]] = gpu.subgroup_mma_compute -// CHECK: %[[EXT:.+]] = gpu.subgroup_mma_elementwise truncf %[[COMPUTE]] : (!gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> +// CHECK: %[[EXT:.+]] = gpu.subgroup_mma_elementwise truncf %[[COMPUTE]] : (!gpu.mma_matrix<16x16xf32, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: gpu.subgroup_mma_store_matrix %[[EXT]] func.func @cast_f32_to_f16_write(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf32>, %arg2: memref<16x16xf32>, %arg3: memref<16x16xf16>) { %c0 = arith.constant 0 : index @@ -536,10 +536,10 @@ func.func @fold_transpose_into_transfer_read(%alloc: memref<64x128xf16>, %vector // CHECK-LABEL: func @cast_f16_to_f32_read // CHECK: %[[A:.+]] = gpu.subgroup_mma_load_matrix {{.+}} {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "AOp"> // CHECK: %[[C:.+]] = gpu.subgroup_mma_load_matrix {{.+}} {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp"> -// CHECK: %[[AE:.+]] = gpu.subgroup_mma_elementwise extf %[[A]] : (!gpu.mma_matrix<16x16xf16, "AOp">) -> !gpu.mma_matrix<16x16xf32, "AOp"> -// CHECK: %[[CE:.+]] = gpu.subgroup_mma_elementwise extf %[[C]] : (!gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> +// CHECK: %[[AE:.+]] = gpu.subgroup_mma_elementwise extf %[[A]] : (!gpu.mma_matrix<16x16xf16, "AOp">) -> !gpu.mma_matrix<16x16xf32, "AOp"> +// CHECK: %[[CE:.+]] = gpu.subgroup_mma_elementwise extf %[[C]] : (!gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> // CHECK: %[[B:.+]] = gpu.subgroup_mma_load_matrix {{.+}} {leadDimension = 16 : index, transpose} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "BOp"> -// CHECK: %[[BE:.+]] = gpu.subgroup_mma_elementwise extf %[[B]] : (!gpu.mma_matrix<16x16xf16, "BOp">) -> !gpu.mma_matrix<16x16xf32, "BOp"> +// CHECK: %[[BE:.+]] = gpu.subgroup_mma_elementwise extf %[[B]] : (!gpu.mma_matrix<16x16xf16, "BOp">) -> !gpu.mma_matrix<16x16xf32, "BOp"> // CHECK: gpu.subgroup_mma_compute %[[AE]], %[[BE]], %[[CE]] func.func @cast_f16_to_f32_read(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<16x16xf16>, %arg3: memref<16x16xf32>) { %c0 = arith.constant 0 : index @@ -582,7 +582,7 @@ func.func @test_unsupported(%arg0: vector<4x4xi32>, %arg1: vector<4x4xi32>, %arg // CHECK-LABEL: func @addf // CHECK: %[[A:.+]] = gpu.subgroup_mma_load_matrix {{.+}} {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: %[[B:.+]] = gpu.subgroup_mma_load_matrix {{.+}} {leadDimension = 16 : index, transpose} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp"> -// CHECK: %[[C:.+]] = gpu.subgroup_mma_elementwise addf %[[A]], %[[B]] : (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> +// CHECK: %[[C:.+]] = gpu.subgroup_mma_elementwise addf %[[A]], %[[B]] : (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: gpu.subgroup_mma_store_matrix %[[C]] func.func @addf(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<16x16xf16>) { %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Affine/ops.mlir b/mlir/test/Dialect/Affine/ops.mlir index 8a3f41d1d9b0..0992d392bcd1 100644 --- a/mlir/test/Dialect/Affine/ops.mlir +++ b/mlir/test/Dialect/Affine/ops.mlir @@ -328,7 +328,7 @@ module { %c1 = arith.constant 1 : index gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) { - %thread_id_x = gpu.thread_id x + %thread_id_x = gpu.thread_id x %c128 = arith.constant 128 : index affine.for %arg12 = %thread_id_x to %c128 step 8 { } @@ -338,7 +338,7 @@ module { } } -// CHECK: %[[THREAD_ID:.*]] = gpu.thread_id x +// CHECK: %[[THREAD_ID:.*]] = gpu.thread_id x // CHECK: %[[VAL:.*]] = arith.constant 128 : index // CHECK: affine.for %{{.*}} = %[[THREAD_ID]] to %[[VAL]] step 8 { @@ -357,7 +357,7 @@ module { %dim = memref.dim %arg0, %c3 : memref %c0 = arith.constant 0 : index affine.for %arg3 = %c0 to %dim step 32 { - %thread_id_x = gpu.thread_id x + %thread_id_x = gpu.thread_id x %0 = affine.apply #map()[%thread_id_x] %c128 = arith.constant 128 : index affine.for %arg4 = %0 to %c128 step 8 { @@ -374,7 +374,7 @@ module { // CHECK: %[[VAL_2:.*]] = memref.dim %[[VAL_0]], %[[VAL_1]] : memref // CHECK: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK: affine.for %[[VAL_4:.*]] = %[[VAL_3]] to %[[VAL_2]] step 32 { -// CHECK: %[[VAL_5:.*]] = gpu.thread_id x +// CHECK: %[[VAL_5:.*]] = gpu.thread_id x // CHECK: %[[VAL_6:.*]] = affine.apply #[[$ATTR_0]](){{\[}}%[[VAL_5]]] // CHECK: %[[VAL_7:.*]] = arith.constant 128 : index // CHECK: affine.for %{{.*}} = %[[VAL_6]] to %[[VAL_7]] step 8 { diff --git a/mlir/test/Dialect/GPU/subgroup-mma-vector-unroll.mlir b/mlir/test/Dialect/GPU/subgroup-mma-vector-unroll.mlir index 03aba89c11af..8b0a62e9c638 100644 --- a/mlir/test/Dialect/GPU/subgroup-mma-vector-unroll.mlir +++ b/mlir/test/Dialect/GPU/subgroup-mma-vector-unroll.mlir @@ -7,8 +7,8 @@ func.func @matmul(%lhs: memref<32x32xf32>, %rhs: memref<32x32xf32>, %out: memref %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index %cst_0 = arith.constant 0.000000e+00 : f32 - %3 = gpu.thread_id x - %4 = gpu.thread_id y + %3 = gpu.thread_id x + %4 = gpu.thread_id y %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%4] %6 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%3] // CHECK: scf.for {{.*}} -> (vector<16x16xf32>) { @@ -58,8 +58,8 @@ func.func @gathered_matmul(%lhs: memref<32x32xf32>, %rhs: memref<32x32xf32>, %ou %cst_1 = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex> %cst_2 = arith.constant dense<1> : vector<4x4xindex> %alloc = memref.alloc() {alignment = 64 : i64} : memref<32x32xf32> - %3 = gpu.thread_id x - %4 = gpu.thread_id y + %3 = gpu.thread_id x + %4 = gpu.thread_id y %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%4] %6 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%3] // CHECK: scf.for {{.*}} -> (vector<16x16xf32>) { diff --git a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir index 386793ad8864..0d4f4d590bb4 100644 --- a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir +++ b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir @@ -5,11 +5,11 @@ func.func @subgroupId(%sz : index, %mem: memref) { gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz) threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) { - // CHECK: %[[DIMX:.*]] = gpu.block_dim x - // CHECK-NEXT: %[[DIMY:.*]] = gpu.block_dim y - // CHECK-NEXT: %[[TIDX:.*]] = gpu.thread_id x - // CHECK-NEXT: %[[TIDY:.*]] = gpu.thread_id y - // CHECK-NEXT: %[[TIDZ:.*]] = gpu.thread_id z + // CHECK: %[[DIMX:.*]] = gpu.block_dim x + // CHECK-NEXT: %[[DIMY:.*]] = gpu.block_dim y + // CHECK-NEXT: %[[TIDX:.*]] = gpu.thread_id x + // CHECK-NEXT: %[[TIDY:.*]] = gpu.thread_id y + // CHECK-NEXT: %[[TIDZ:.*]] = gpu.thread_id z // CHECK-NEXT: %[[T0:.*]] = arith.muli %[[DIMY]], %[[TIDZ]] : index // CHECK-NEXT: %[[T1:.*]] = arith.addi %[[T0]], %[[TIDY]] : index // CHECK-NEXT: %[[T2:.*]] = arith.muli %[[DIMX]], %[[T1]] : index diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir index 465e8fdd6642..7e4a02109227 100644 --- a/mlir/test/Dialect/GPU/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -12,8 +12,8 @@ func.func @blocks_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : %c7 = arith.constant 7 : index %one = arith.constant 1 : index // CHECK: gpu.launch -// CHECK: %[[BLKX:.*]] = gpu.block_id x -// CHECK: %[[BLKY:.*]] = gpu.block_id y +// CHECK: %[[BLKX:.*]] = gpu.block_id x +// CHECK: %[[BLKY:.*]] = gpu.block_id y // CHECK: memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]]] // CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]]] %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) @@ -59,8 +59,8 @@ func.func @warpgroup_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream // CHECK-DAG: %[[C512:.*]] = arith.constant 512 : index // CHECK: gpu.launch -// CHECK: %[[TIDX:.*]] = gpu.thread_id x -// CHECK: %[[TIDY:.*]] = gpu.thread_id y +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDY:.*]] = gpu.thread_id y // CHECK-DAG: %[[WG:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]] // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C384]] : index // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index @@ -112,8 +112,8 @@ func.func @warp_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !g // CHECK-DAG: %[[c64:.*]] = arith.constant 64 : index // CHECK: gpu.launch -// CHECK: %[[TIDX:.*]] = gpu.thread_id x -// CHECK: %[[TIDY:.*]] = gpu.thread_id y +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDY:.*]] = gpu.thread_id y // CHECK-DAG: %[[W:.*]] = affine.apply #[[$MAP]]()[%[[TIDX]]] // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[TIDY]], %[[C3]] : index @@ -162,8 +162,8 @@ func.func @threads_3d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : // CHECK: %[[C9:.*]] = arith.constant 9 : index // CHECK: %[[C7:.*]] = arith.constant 7 : index // CHECK: gpu.launch async [%{{.*}}] blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C1]], %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C12]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]]) -// CHECK: %[[TIDX:.*]] = gpu.thread_id x -// CHECK: %[[TIDY:.*]] = gpu.thread_id y +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDY:.*]] = gpu.thread_id y // CHECK: arith.cmpi ult, %[[TIDX]], %[[C9]] : index // CHECK: arith.cmpi ult, %[[TIDY]], %[[C7]] : index // CHECK: memref.load %[[ARGX]][%[[TIDY]], %[[TIDX]]] @@ -215,10 +215,10 @@ func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d { // CHECK: %[[C4:.*]] = arith.constant 4 : index // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C64]], %{{.*}} = %[[C1]]) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C32]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1]]) -// CHECK: %[[BLKX:.*]] = gpu.block_id x -// CHECK: %[[BLKY:.*]] = gpu.block_id y -// CHECK: %[[TIDX:.*]] = gpu.thread_id x -// CHECK: %[[TIDY:.*]] = gpu.thread_id y +// CHECK: %[[BLKX:.*]] = gpu.block_id x +// CHECK: %[[BLKY:.*]] = gpu.block_id y +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDY:.*]] = gpu.thread_id y // CHECK: memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] // CHECK: memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]] scf.forall (%i, %j) in (%c32, %c64) { @@ -288,7 +288,7 @@ func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { -// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDX:.*]] = gpu.thread_id x // CHECK: memref.load %[[ARGX]][%[[TIDX]], %[[TIDX]]] // CHECK: memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]] scf.forall (%i) in (%c32) { @@ -322,7 +322,7 @@ func.func @saxpy3d_fold_id_z(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %s %c9 = arith.constant 9 : index %c7 = arith.constant 7 : index // CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK-NOT: gpu.thread_id z +// CHECK-NOT: gpu.thread_id z %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { @@ -373,9 +373,9 @@ func.func @warpgroup_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %st // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x -// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y -// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z +// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x +// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y +// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z // CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWGLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWGX]]()[%[[TIDX]], %[[TIDY]]] // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWGY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] @@ -429,9 +429,9 @@ func.func @warp_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index // CHECK-DAG: %[[C192:.*]] = arith.constant 192 : index -// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x -// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y -// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z +// CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x +// CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y +// CHECK-DAG: %[[TIDZ:.*]] = gpu.thread_id z // CHECK-DAG: %[[WIDLIN:.*]] = affine.apply #[[$MAPWLIN]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] @@ -495,8 +495,8 @@ func.func @map_multi_level_linear(%x: !type, %y: !type, %t: !type1d, %alpha : f3 %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) { - // CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x - // CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y + // CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id x + // CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id y scf.forall (%i, %j) in (%c7, %c9) { %4 = memref.load %x[%i, %j] : !type %5 = memref.load %y[%i, %j] : !type @@ -563,9 +563,9 @@ func.func @block_linear_existing_launch( // CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index // CHECK-DAG: %[[C63:.*]] = arith.constant 63 : index // CHECK: gpu.launch async [{{.*}}] blocks({{.*}}) in (%{{.*}} = %[[C12]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]]) threads -// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x -// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y -// CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z // CHECK-DAG: %[[BIDLIN:.*]] = affine.apply #[[$MAPBLIN]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]] // CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]] // CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]] @@ -617,9 +617,9 @@ func.func @block_linear_generate_launch( // CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index // CHECK-DAG: %[[C9:.*]] = arith.constant 9 : index // CHECK: gpu.launch blocks({{.*}}) in (%{{.*}} = %[[C7]], %{{.*}} = %[[C9]], %{{.*}} = %[[C1]]) threads -// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x -// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y -// CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z +// CHECK-DAG: %[[BIDX:.*]] = gpu.block_id x +// CHECK-DAG: %[[BIDY:.*]] = gpu.block_id y +// CHECK-DAG: %[[BIDZ:.*]] = gpu.block_id z // CHECK-DAG: %[[BLX:.*]] = affine.apply #[[$MAPBX]]()[%[[BIDX]]] // CHECK-DAG: %[[BLY:.*]] = affine.apply #[[$MAPBY]]()[%[[BIDX]], %[[BIDY]], %[[BIDZ]]] // CHECK: memref.load %[[ARGX]][%[[BLX]], %[[BLY]]] @@ -659,14 +659,14 @@ func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> { // CHECK: %[[C8:.*]] = arith.constant 8 : index // CHECK: gpu.launch scf.forall (%arg1) in (1) { -// CHECK: %[[BIDX:.*]] = gpu.block_id x +// CHECK: %[[BIDX:.*]] = gpu.block_id x // CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]] %0 = affine.apply #map(%arg1) %subview = memref.subview %arg0[%0] [128] [1] : memref<128xf32> to memref<128xf32, strided<[1], offset: ?>> scf.forall (%arg2) in (4) { -// CHECK: %[[TIDX:.*]] = gpu.thread_id x -// CHECK: %[[TIDY:.*]] = gpu.thread_id y -// CHECK: %[[TIDZ:.*]] = gpu.thread_id z +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: %[[TIDY:.*]] = gpu.thread_id y +// CHECK: %[[TIDZ:.*]] = gpu.thread_id z // CHECK: %[[THX:.*]] = affine.apply #[[$MAPW]]()[%[[TIDX]], %[[TIDY]], %[[TIDZ]]] // CHECK-NOT: scf.if // CHECK: memref.subview %{{.*}}[%[[THX]]] @@ -709,7 +709,7 @@ func.func @simple_fill(%arg0: memref<128x256xf32>) -> memref<128x256xf32> { // CHECK: %[[C6:.*]] = arith.constant 6 : index // CHECK: gpu.launch scf.forall (%arg1) in (1) { - // CHECK: %[[BIDX:.*]] = gpu.block_id x + // CHECK: %[[BIDX:.*]] = gpu.block_id x // CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]] %0 = affine.apply #map(%arg1) %subview = memref.subview %arg0[%0, 0] [128, 256] [1, 1] @@ -719,8 +719,8 @@ func.func @simple_fill(%arg0: memref<128x256xf32>) -> memref<128x256xf32> { // involving threadIdx.x/y by the map_nested_forall_to_threads // transformation. This results in a if (linear_thread_id < 6) conditional. scf.forall (%arg2, %arg3) in (2, 3) { - // CHECK: %[[TIDX:.*]] = gpu.thread_id x - // CHECK: %[[TIDY:.*]] = gpu.thread_id y + // CHECK: %[[TIDX:.*]] = gpu.thread_id x + // CHECK: %[[TIDY:.*]] = gpu.thread_id y // CHECK: %[[LID:.*]] = affine.apply #[[$MAPLANE]]()[%[[TIDX]], %[[TIDY]]] // CHECK: %[[COND:.*]] = arith.cmpi ult, %[[LID]], %[[C6]] // CHECK: scf.if %[[COND]] @@ -777,7 +777,7 @@ func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> { // CHECK: gpu.launch scf.forall (%arg1) in (1) { -// CHECK: %[[BIDX:.*]] = gpu.block_id x +// CHECK: %[[BIDX:.*]] = gpu.block_id x // CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]] %0 = affine.apply #map(%arg1) %subview = memref.subview %arg0[%0] [128] [1] : memref<128xf32> to memref<128xf32, strided<[1], offset: ?>> @@ -786,8 +786,8 @@ func.func @simple_fill(%arg0: memref<128xf32>) -> memref<128xf32> { // involving threadIdx.x/y by the map_nested_forall_to_threads // transformation. This results in a if (linear_thread_id < 6) conditional. scf.forall (%arg2, %arg3) in (2, 3) { - // CHECK: %[[TIDX:.*]] = gpu.thread_id x - // CHECK: %[[TIDY:.*]] = gpu.thread_id y + // CHECK: %[[TIDX:.*]] = gpu.thread_id x + // CHECK: %[[TIDY:.*]] = gpu.thread_id y // CHECK: %[[LIN_W:.*]] = affine.apply #[[$MAP_LIN_W]]()[%[[TIDX]], %[[TIDY]]] // diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir index a7d2565cff74..2c236d48ae78 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul.mlir @@ -20,10 +20,10 @@ // CHECK-SAME: %[[VAL_6:.*6]]: memref) kernel { // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_9:.*]] = gpu.block_id x -// CHECK: %[[VAL_10:.*]] = gpu.block_dim x -// CHECK: %[[VAL_11:.*]] = gpu.thread_id x -// CHECK: %[[VAL_12:.*]] = gpu.grid_dim x +// CHECK: %[[VAL_9:.*]] = gpu.block_id x +// CHECK: %[[VAL_10:.*]] = gpu.block_dim x +// CHECK: %[[VAL_11:.*]] = gpu.thread_id x +// CHECK: %[[VAL_12:.*]] = gpu.grid_dim x // CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_9]], %[[VAL_10]] : index // CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_11]] : index // CHECK: %[[VAL_15:.*]] = arith.muli %[[VAL_10]], %[[VAL_12]] : index diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir index 0c5ff55dd863..16af4eba8791 100644 --- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir +++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec.mlir @@ -18,10 +18,10 @@ // CHECK-SAME: %[[VAL_4:.*4]]: memref, // CHECK-SAME: %[[VAL_5:.*5]]: memref) kernel { // CHECK: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_7:.*]] = gpu.block_id x -// CHECK: %[[VAL_8:.*]] = gpu.block_dim x -// CHECK: %[[VAL_9:.*]] = gpu.thread_id x -// CHECK: %[[VAL_10:.*]] = gpu.grid_dim x +// CHECK: %[[VAL_7:.*]] = gpu.block_id x +// CHECK: %[[VAL_8:.*]] = gpu.block_dim x +// CHECK: %[[VAL_9:.*]] = gpu.thread_id x +// CHECK: %[[VAL_10:.*]] = gpu.grid_dim x // CHECK: %[[VAL_11:.*]] = arith.muli %[[VAL_7]], %[[VAL_8]] : index // CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_9]] : index // CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_8]], %[[VAL_10]] : index diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index 0202a90ac60c..691913b3bd5d 100644 --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -1531,7 +1531,7 @@ func.func @vector_insert_strided_slice_2d_to_2d(%laneid: index) -> (vector<64x1x // CHECK-PROP-SAME: %[[AR1:[^ :]*]]: memref<1x4x2xi32>, // CHECK-PROP-SAME: %[[AR2:[^ :]*]]: memref<1x4x1024xf32>) // CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-PROP-DAG: %[[THREADID:.*]] = gpu.thread_id x +// CHECK-PROP-DAG: %[[THREADID:.*]] = gpu.thread_id x // CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%[[THREADID]])[32] args(%[[IN2]] // CHECK-PROP: %[[GATHER:.*]] = vector.gather %[[AR1]][{{.*}}] // CHECK-PROP: %[[EXTRACT:.*]] = vector.shape_cast %[[GATHER]] : vector<1x64xi32> to vector<64xi32> @@ -1542,7 +1542,7 @@ func.func @vector_insert_strided_slice_2d_to_2d(%laneid: index) -> (vector<64x1x // CHECK-PROP: %[[TRANSFERREAD:.*]] = vector.transfer_read %[[AR2]][%[[C0]], %[[W]], %[[APPLY]]], // CHECK-PROP: return %[[TRANSFERREAD]] func.func @transfer_read_prop_operands(%in2: vector<1x2xindex>, %ar1 : memref<1x4x2xi32>, %ar2 : memref<1x4x1024xf32>)-> vector<2xf32> { - %0 = gpu.thread_id x + %0 = gpu.thread_id x %c0_i32 = arith.constant 0 : index %c0 = arith.constant 0 : index %cst = arith.constant dense<0> : vector<1x64xi32> diff --git a/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir b/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir index 4a0117bfc1df..b34809c891c4 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir @@ -19,15 +19,15 @@ module attributes {gpu.container_module} { } gpu.module @gpumodule { gpu.func @kernel_cluster() kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { - %cidX = gpu.cluster_id x - %cidY = gpu.cluster_id y - %cidZ = gpu.cluster_id z - %cdimX = gpu.cluster_dim_blocks x - %cdimY = gpu.cluster_dim_blocks y - %cdimZ = gpu.cluster_dim_blocks z - %bidX = gpu.block_id x - %bidY = gpu.block_id y - %bidZ = gpu.block_id z + %cidX = gpu.cluster_id x + %cidY = gpu.cluster_id y + %cidZ = gpu.cluster_id z + %cdimX = gpu.cluster_dim_blocks x + %cdimY = gpu.cluster_dim_blocks y + %cdimZ = gpu.cluster_dim_blocks z + %bidX = gpu.block_id x + %bidY = gpu.block_id y + %bidZ = gpu.block_id z %cidX_i32 = index.casts %cidX : index to i32 %cidY_i32 = index.casts %cidY : index to i32 %cidZ_i32 = index.casts %cidZ : index to i32 diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir index 37564de7442c..22474cbcd39f 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir @@ -147,7 +147,7 @@ func.func @main() { %c57344 = arith.constant 57344 : index %c40960 = arith.constant 40960 : index - %tidx = gpu.thread_id x + %tidx = gpu.thread_id x %dynsmem = gpu.dynamic_shared_memory : memref> %lhsShmem = memref.view %dynsmem[%c0][] : memref> to memref<2x128x64xf16, #gpu.address_space> %rhsShmem = memref.view %dynsmem[%c32768][] : memref> to memref<2x64x128xf16, #gpu.address_space> diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir index db7754c89dca..39bad38f3646 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir @@ -147,7 +147,7 @@ func.func @main() { %c57344 = arith.constant 57344 : index %c40960 = arith.constant 40960 : index - %tidx = gpu.thread_id x + %tidx = gpu.thread_id x %dynsmem = gpu.dynamic_shared_memory : memref> %lhsShmem = memref.view %dynsmem[%c0][] : memref> to memref<2x128x64xf16, #gpu.address_space> %rhsShmem = memref.view %dynsmem[%c32768][] : memref> to memref<2x64x128xf16, #gpu.address_space> diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir index afbbeb025a57..f281c028ebca 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir @@ -70,17 +70,17 @@ module { %26 = gpu.dynamic_shared_memory : memref> %view = memref.view %26[%c0][] : memref> to memref<2x2x64x64xf16, #gpu.address_space> %27 = nvgpu.mbarrier.create -> > - %thread_id_x = gpu.thread_id x + %thread_id_x = gpu.thread_id x %28 = arith.index_cast %thread_id_x : index to i32 %29 = arith.shrui %28, %c5_i32 : i32 - %30 = nvvm.shfl.sync idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32 + %30 = nvvm.shfl.sync idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32 %31 = arith.cmpi eq, %30, %c0_i32 : i32 %32 = nvvm.elect.sync -> i1 %33 = arith.andi %31, %32 : i1 scf.if %33 { nvgpu.mbarrier.init %27[%c0], %c1 : > } - %34 = nvvm.shfl.sync idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32 + %34 = nvvm.shfl.sync idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32 %35 = arith.cmpi eq, %34, %c0_i32 : i32 %36 = nvvm.elect.sync -> i1 %37 = arith.andi %35, %36 : i1 @@ -95,13 +95,13 @@ module { %39 = arith.muli %arg15, %c64 : index %subview = memref.subview %view[%arg14, %arg15, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xf16, #gpu.address_space> to memref<64x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space> %subview_0 = memref.subview %dstMemref[%38, %39] [64, 64] [1, 1] : memref<128x128xf16> to memref<64x64xf16, strided<[128, 1], offset: ?>> - %block_dim_x = gpu.block_dim x - %thread_id_y = gpu.thread_id y + %block_dim_x = gpu.block_dim x + %thread_id_y = gpu.thread_id y %40 = arith.muli %thread_id_y, %block_dim_x : index %41 = arith.addi %thread_id_x, %40 : index - %block_dim_y = gpu.block_dim y + %block_dim_y = gpu.block_dim y %42 = arith.muli %block_dim_x, %block_dim_y : index - %thread_id_z = gpu.thread_id z + %thread_id_z = gpu.thread_id z %43 = arith.muli %thread_id_z, %42 : index %44 = arith.addi %41, %43 : index %45 = arith.cmpi eq, %44, %c0 : index diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir index ae96568a4650..69959893bed6 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir @@ -73,8 +73,8 @@ module @mymod { // Step 5. Launch a GPU kernel gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) { - %5 = gpu.block_dim x - %6 = gpu.thread_id x + %5 = gpu.block_dim x + %6 = gpu.thread_id x %7 = memref.get_global @bufferLhsGlobal : !shmemlhs // Step 6. Initialize the mbarrier diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir index b209114e957f..9c0988d8a129 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir @@ -94,8 +94,8 @@ module @mymod { // Step 4. Launch a GPU kernel gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) dynamic_shared_memory_size %c32768_i32 { - %5 = gpu.block_dim x - %6 = gpu.thread_id x + %5 = gpu.block_dim x + %6 = gpu.thread_id x %c0 = arith.constant 0 : index %txcount = arith.constant 32768 : index %c24576 = arith.constant 24576 : index diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir index 31ee19500b85..fc9f6c3d17ab 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir @@ -68,8 +68,8 @@ module @mymod { %3 = nvgpu.tma.create.descriptor %cast box[%c64, %c8] : memref<*xf32> -> , swizzle = none, l2promo = none, oob = zero, interleave = none> %4 = nvgpu.tma.create.descriptor %cast_3 box[%c8, %c128] : memref<*xf32> -> , swizzle = none, l2promo = none, oob = zero, interleave = none> gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) { - %5 = gpu.block_dim x - %6 = gpu.thread_id x + %5 = gpu.block_dim x + %6 = gpu.thread_id x %7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3> %8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3> %9 = nvgpu.mbarrier.create -> > diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir index 6ba9c1639019..39f876bf5ccd 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir @@ -91,7 +91,7 @@ func.func @main() { linalg.copy ins(%memref: memref<64x8xf32>) outs(%out: memref<64x8xf32, 3>) linalg.copy ins(%memref_1: memref<8x128xf32>) outs(%out_1: memref<8x128xf32, 3>) - %6 = gpu.thread_id x + %6 = gpu.thread_id x %10 = arith.cmpi eq, %6, %c0 : index scf.if %10 { %11 = memref.load %out[%c45, %c7] : memref<64x8xf32, 3> diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir index 7e66dee0272f..ec82ad6a2b83 100644 --- a/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir +++ b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir @@ -42,9 +42,9 @@ module @add attributes {gpu.container_module} { attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = gpu.block_id x - %1 = gpu.block_id y - %2 = gpu.block_id z + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32> %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32> %5 = arith.addf %3, %4 : f32 diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir index df8fbe4d86d9..6900e866cba3 100644 --- a/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir +++ b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir @@ -42,8 +42,8 @@ module @add attributes {gpu.container_module} { attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = gpu.block_id x - %1 = gpu.block_id y + %0 = gpu.block_id x + %1 = gpu.block_id y %2 = memref.load %arg0[%0, %1] : memref<3x3xi64> %3 = memref.load %arg1[%0, %1] : memref<3x3xi64> %4 = arith.addi %2, %3 : i64 diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir index cd99f2c70dc6..4eb729e67e1b 100644 --- a/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir +++ b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir @@ -39,9 +39,9 @@ module @add attributes {gpu.container_module} { attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = gpu.block_id x - %1 = gpu.block_id y - %2 = gpu.block_id z + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32> %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32> %5 = arith.addf %3, %4 : f32 diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir index d0f21873e6e2..ca503ab35f95 100644 --- a/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir +++ b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir @@ -49,8 +49,8 @@ module @relu attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_relu(%arg0: memref<4x5xf32>, %arg1: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %zero = arith.constant 0.000000e+00 : f32 - %0 = gpu.block_id x - %1 = gpu.block_id y + %0 = gpu.block_id x + %1 = gpu.block_id y %2 = memref.load %arg0[%0, %1] : memref<4x5xf32> %3 = arith.cmpf ogt, %2, %zero : f32 %4 = arith.select %3, %2, %zero : f32 diff --git a/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir index fad0d1d313a7..7c369036e26f 100644 --- a/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir +++ b/mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir @@ -39,9 +39,9 @@ module @add attributes {gpu.container_module} { } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = gpu.block_id x - %1 = gpu.block_id y - %2 = gpu.block_id z + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32> %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32> %5 = arith.addf %3, %4 : f32 diff --git a/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir index 73d7fe3644c4..12a9caf6f72b 100644 --- a/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir +++ b/mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir @@ -39,8 +39,8 @@ module @add attributes {gpu.container_module} { } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = gpu.block_id x - %1 = gpu.block_id y + %0 = gpu.block_id x + %1 = gpu.block_id y %2 = memref.load %arg0[%0, %1] : memref<3x3xi64> %3 = memref.load %arg1[%0, %1] : memref<3x3xi64> %4 = arith.addi %2, %3 : i64 diff --git a/mlir/test/Integration/GPU/SYCL/gpu-memcpy-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-memcpy-addf32-to-spirv.mlir index 32888efe3457..ada6dd548ed7 100644 --- a/mlir/test/Integration/GPU/SYCL/gpu-memcpy-addf32-to-spirv.mlir +++ b/mlir/test/Integration/GPU/SYCL/gpu-memcpy-addf32-to-spirv.mlir @@ -36,9 +36,9 @@ module @add attributes {gpu.container_module} { } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = gpu.block_id x - %1 = gpu.block_id y - %2 = gpu.block_id z + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32> %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32> %5 = arith.addf %3, %4 : f32 diff --git a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir index 9d45f405e9f0..24384e4a68a4 100644 --- a/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir +++ b/mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir @@ -49,8 +49,8 @@ module @relu attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_relu(%arg0: memref<4x5xf32>, %arg1: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %zero = arith.constant 0.000000e+00 : f32 - %0 = gpu.block_id x - %1 = gpu.block_id y + %0 = gpu.block_id x + %1 = gpu.block_id y %2 = memref.load %arg0[%0, %1] : memref<4x5xf32> %3 = arith.cmpf ogt, %2, %zero : f32 %4 = arith.select %3, %2, %zero : f32 diff --git a/mlir/test/python/dialects/gpu/dialect.py b/mlir/test/python/dialects/gpu/dialect.py index 331993ee1882..0956805b2748 100644 --- a/mlir/test/python/dialects/gpu/dialect.py +++ b/mlir/test/python/dialects/gpu/dialect.py @@ -29,7 +29,7 @@ def testMMAElementWiseAttr(): module = Module.create() with InsertionPoint(module.body): gpu.BlockDimOp(gpu.Dimension.y) - # CHECK: %block_dim_y = gpu.block_dim y + # CHECK: %block_dim_y = gpu.block_dim y print(module) pass @@ -146,18 +146,18 @@ def testGPUFuncOp(): # CHECK: gpu.module @gpu_module # CHECK: gpu.func @kernel0() kernel { - # CHECK: %[[VAL_0:.*]] = gpu.global_id x + # CHECK: %[[VAL_0:.*]] = gpu.global_id x # CHECK: gpu.return # CHECK: } # CHECK: gpu.func @kernel1() kernel attributes # CHECK-SAME: known_block_size = array # CHECK-SAME: known_grid_size = array - # CHECK: %[[VAL_0:.*]] = gpu.global_id x + # CHECK: %[[VAL_0:.*]] = gpu.global_id x # CHECK: gpu.return # CHECK: } # CHECK: gpu.func @non_kernel_func( # CHECK-SAME: %[[ARG0:.*]]: index {gpu.some_attribute = "foo"}) { - # CHECK: %[[GLOBAL_ID_0:.*]] = gpu.global_id x + # CHECK: %[[GLOBAL_ID_0:.*]] = gpu.global_id x # CHECK: gpu.return # CHECK: }