Erich Keane b8f5cbb4ff
[OpenACC][CIR] 'cache' construct lowering (#146915)
The 'cache' construct is an interesting one, in that it doesn't take any
clauses, and is exclusively a collection of variables. Lowering wise,
  these just get added to the associated acc.loop.  This did require
  some work to ensure that the cache doesn't have 'vars' that aren't
  inside of the loop, but Sema is taking care of that with a warning.

Otherwise this is just a fairly simple amount of lowering, where each
'var' in the list creates an acc.cache, which is added to the acc.loop.
2025-07-07 08:35:05 -07:00

133 lines
8.2 KiB
C

// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s
void acc_cache() {
// CHECK: cir.func{{.*}} @acc_cache() {
int iArr[10];
// CHECK-NEXT: %[[IARR:.*]] = cir.alloca !cir.array<!s32i x 10>, !cir.ptr<!cir.array<!s32i x 10>>, ["iArr"]
float fArr[10];
// CHECK-NEXT: %[[FARR:.*]] = cir.alloca !cir.array<!cir.float x 10>, !cir.ptr<!cir.array<!cir.float x 10>>, ["fArr"]
#pragma acc cache(iArr[1], fArr[1:5])
// This does nothing, as it is not in a loop.
#pragma acc parallel
{
#pragma acc cache(iArr[1], fArr[1:5])
// This does nothing, as it is not in a loop.
}
// CHECK-NEXT: acc.parallel {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
#pragma acc loop
for(int i = 0; i < 5; ++i) {
for(int j = 0; j < 5; ++j) {
#pragma acc cache(iArr[1], fArr[1:5])
}
}
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
// CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64)
// CHECK-NEXT: %[[CACHE1:.*]] = acc.cache varPtr(%[[IARR]] : !cir.ptr<!cir.array<!s32i x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!s32i x 10>> {name = "iArr[1]", structured = false}
//
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
// CHECK-NEXT: %[[FIVE:.*]] = cir.const #cir.int<5>
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE]] : !s32i to si32
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[FIVE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) startIdx(%[[ZERO_CONST]] : i64)
// CHECK-NEXT: %[[CACHE2:.*]] = acc.cache varPtr(%[[FARR]] : !cir.ptr<!cir.array<!cir.float x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!cir.float x 10>> {name = "fArr[1:5]", structured = false}
//
// CHECK-NEXT: acc.loop cache(%[[CACHE1]], %[[CACHE2]] : !cir.ptr<!cir.array<!s32i x 10>>, !cir.ptr<!cir.array<!cir.float x 10>>) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]}
#pragma acc loop
for(int i = 0; i < 5; ++i) {
#pragma acc cache(iArr[1], fArr[1:5])
}
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
// CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64)
// CHECK-NEXT: %[[CACHE1:.*]] = acc.cache varPtr(%[[IARR]] : !cir.ptr<!cir.array<!s32i x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!s32i x 10>> {name = "iArr[1]", structured = false}
//
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
// CHECK-NEXT: %[[FIVE:.*]] = cir.const #cir.int<5>
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE]] : !s32i to si32
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[FIVE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) startIdx(%[[ZERO_CONST]] : i64)
// CHECK-NEXT: %[[CACHE2:.*]] = acc.cache varPtr(%[[FARR]] : !cir.ptr<!cir.array<!cir.float x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!cir.float x 10>> {name = "fArr[1:5]", structured = false}
//
// CHECK-NEXT: acc.loop cache(%[[CACHE1]], %[[CACHE2]] : !cir.ptr<!cir.array<!s32i x 10>>, !cir.ptr<!cir.array<!cir.float x 10>>) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]}
#pragma acc parallel loop
for(int i = 0; i < 5; ++i) {
#pragma acc cache(iArr[1], fArr[1:5])
}
// CHECK-NEXT: acc.parallel combined(loop) {
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
// CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64)
// CHECK-NEXT: %[[CACHE1:.*]] = acc.cache varPtr(%[[IARR]] : !cir.ptr<!cir.array<!s32i x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!s32i x 10>> {name = "iArr[1]", structured = false}
//
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
// CHECK-NEXT: %[[FIVE:.*]] = cir.const #cir.int<5>
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE]] : !s32i to si32
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[FIVE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) startIdx(%[[ZERO_CONST]] : i64)
// CHECK-NEXT: %[[CACHE2:.*]] = acc.cache varPtr(%[[FARR]] : !cir.ptr<!cir.array<!cir.float x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!cir.float x 10>> {name = "fArr[1:5]", structured = false}
//
// CHECK-NEXT: acc.loop combined(parallel) cache(%[[CACHE1]], %[[CACHE2]] : !cir.ptr<!cir.array<!s32i x 10>>, !cir.ptr<!cir.array<!cir.float x 10>>) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
#pragma acc parallel loop
for(int i = 0; i < 5; ++i) {
int localArr[5];
// The first term here isn't lowered, because it references data inside of the 'loop'.
#pragma acc cache(localArr[i], iArr[1], fArr[1:5])
}
// CHECK-NEXT: acc.parallel combined(loop) {
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
// CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64)
// CHECK-NEXT: %[[CACHE1:.*]] = acc.cache varPtr(%[[IARR]] : !cir.ptr<!cir.array<!s32i x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!s32i x 10>> {name = "iArr[1]", structured = false}
//
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
// CHECK-NEXT: %[[FIVE:.*]] = cir.const #cir.int<5>
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE]] : !s32i to si32
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[FIVE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) startIdx(%[[ZERO_CONST]] : i64)
// CHECK-NEXT: %[[CACHE2:.*]] = acc.cache varPtr(%[[FARR]] : !cir.ptr<!cir.array<!cir.float x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!cir.float x 10>> {name = "fArr[1:5]", structured = false}
//
// CHECK-NEXT: acc.loop combined(parallel) cache(%[[CACHE1]], %[[CACHE2]] : !cir.ptr<!cir.array<!s32i x 10>>, !cir.ptr<!cir.array<!cir.float x 10>>) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
}