
The 'cache' construct is an interesting one, in that it doesn't take any clauses, and is exclusively a collection of variables. Lowering wise, these just get added to the associated acc.loop. This did require some work to ensure that the cache doesn't have 'vars' that aren't inside of the loop, but Sema is taking care of that with a warning. Otherwise this is just a fairly simple amount of lowering, where each 'var' in the list creates an acc.cache, which is added to the acc.loop.
133 lines
8.2 KiB
C
133 lines
8.2 KiB
C
// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s
|
|
|
|
void acc_cache() {
|
|
// CHECK: cir.func{{.*}} @acc_cache() {
|
|
|
|
int iArr[10];
|
|
// CHECK-NEXT: %[[IARR:.*]] = cir.alloca !cir.array<!s32i x 10>, !cir.ptr<!cir.array<!s32i x 10>>, ["iArr"]
|
|
float fArr[10];
|
|
// CHECK-NEXT: %[[FARR:.*]] = cir.alloca !cir.array<!cir.float x 10>, !cir.ptr<!cir.array<!cir.float x 10>>, ["fArr"]
|
|
|
|
#pragma acc cache(iArr[1], fArr[1:5])
|
|
// This does nothing, as it is not in a loop.
|
|
|
|
#pragma acc parallel
|
|
{
|
|
#pragma acc cache(iArr[1], fArr[1:5])
|
|
// This does nothing, as it is not in a loop.
|
|
}
|
|
// CHECK-NEXT: acc.parallel {
|
|
// CHECK-NEXT: acc.yield
|
|
// CHECK-NEXT: } loc
|
|
|
|
#pragma acc loop
|
|
for(int i = 0; i < 5; ++i) {
|
|
for(int j = 0; j < 5; ++j) {
|
|
#pragma acc cache(iArr[1], fArr[1:5])
|
|
}
|
|
}
|
|
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
|
|
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
|
|
// CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64)
|
|
// CHECK-NEXT: %[[CACHE1:.*]] = acc.cache varPtr(%[[IARR]] : !cir.ptr<!cir.array<!s32i x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!s32i x 10>> {name = "iArr[1]", structured = false}
|
|
//
|
|
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
|
|
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[FIVE:.*]] = cir.const #cir.int<5>
|
|
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
|
|
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[FIVE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) startIdx(%[[ZERO_CONST]] : i64)
|
|
// CHECK-NEXT: %[[CACHE2:.*]] = acc.cache varPtr(%[[FARR]] : !cir.ptr<!cir.array<!cir.float x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!cir.float x 10>> {name = "fArr[1:5]", structured = false}
|
|
//
|
|
// CHECK-NEXT: acc.loop cache(%[[CACHE1]], %[[CACHE2]] : !cir.ptr<!cir.array<!s32i x 10>>, !cir.ptr<!cir.array<!cir.float x 10>>) {
|
|
// CHECK: acc.yield
|
|
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]}
|
|
|
|
#pragma acc loop
|
|
for(int i = 0; i < 5; ++i) {
|
|
#pragma acc cache(iArr[1], fArr[1:5])
|
|
}
|
|
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
|
|
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
|
|
// CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64)
|
|
// CHECK-NEXT: %[[CACHE1:.*]] = acc.cache varPtr(%[[IARR]] : !cir.ptr<!cir.array<!s32i x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!s32i x 10>> {name = "iArr[1]", structured = false}
|
|
//
|
|
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
|
|
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[FIVE:.*]] = cir.const #cir.int<5>
|
|
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
|
|
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[FIVE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) startIdx(%[[ZERO_CONST]] : i64)
|
|
// CHECK-NEXT: %[[CACHE2:.*]] = acc.cache varPtr(%[[FARR]] : !cir.ptr<!cir.array<!cir.float x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!cir.float x 10>> {name = "fArr[1:5]", structured = false}
|
|
//
|
|
// CHECK-NEXT: acc.loop cache(%[[CACHE1]], %[[CACHE2]] : !cir.ptr<!cir.array<!s32i x 10>>, !cir.ptr<!cir.array<!cir.float x 10>>) {
|
|
// CHECK: acc.yield
|
|
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]}
|
|
|
|
#pragma acc parallel loop
|
|
for(int i = 0; i < 5; ++i) {
|
|
#pragma acc cache(iArr[1], fArr[1:5])
|
|
}
|
|
// CHECK-NEXT: acc.parallel combined(loop) {
|
|
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
|
|
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
|
|
// CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64)
|
|
// CHECK-NEXT: %[[CACHE1:.*]] = acc.cache varPtr(%[[IARR]] : !cir.ptr<!cir.array<!s32i x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!s32i x 10>> {name = "iArr[1]", structured = false}
|
|
//
|
|
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
|
|
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[FIVE:.*]] = cir.const #cir.int<5>
|
|
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
|
|
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[FIVE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) startIdx(%[[ZERO_CONST]] : i64)
|
|
// CHECK-NEXT: %[[CACHE2:.*]] = acc.cache varPtr(%[[FARR]] : !cir.ptr<!cir.array<!cir.float x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!cir.float x 10>> {name = "fArr[1:5]", structured = false}
|
|
//
|
|
// CHECK-NEXT: acc.loop combined(parallel) cache(%[[CACHE1]], %[[CACHE2]] : !cir.ptr<!cir.array<!s32i x 10>>, !cir.ptr<!cir.array<!cir.float x 10>>) {
|
|
// CHECK: acc.yield
|
|
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]}
|
|
// CHECK-NEXT: acc.yield
|
|
// CHECK-NEXT: } loc
|
|
|
|
#pragma acc parallel loop
|
|
for(int i = 0; i < 5; ++i) {
|
|
int localArr[5];
|
|
// The first term here isn't lowered, because it references data inside of the 'loop'.
|
|
#pragma acc cache(localArr[i], iArr[1], fArr[1:5])
|
|
}
|
|
// CHECK-NEXT: acc.parallel combined(loop) {
|
|
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
|
|
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
|
|
// CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64)
|
|
// CHECK-NEXT: %[[CACHE1:.*]] = acc.cache varPtr(%[[IARR]] : !cir.ptr<!cir.array<!s32i x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!s32i x 10>> {name = "iArr[1]", structured = false}
|
|
//
|
|
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
|
|
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[FIVE:.*]] = cir.const #cir.int<5>
|
|
// CHECK-NEXT: %[[FIVE_CAST:.*]] = builtin.unrealized_conversion_cast %[[FIVE]] : !s32i to si32
|
|
// CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 : i64
|
|
// CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
|
|
// CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[FIVE_CAST]] : si32) stride(%[[ONE_CONST]] : i64) startIdx(%[[ZERO_CONST]] : i64)
|
|
// CHECK-NEXT: %[[CACHE2:.*]] = acc.cache varPtr(%[[FARR]] : !cir.ptr<!cir.array<!cir.float x 10>>) bounds(%[[BOUNDS]]) -> !cir.ptr<!cir.array<!cir.float x 10>> {name = "fArr[1:5]", structured = false}
|
|
//
|
|
// CHECK-NEXT: acc.loop combined(parallel) cache(%[[CACHE1]], %[[CACHE2]] : !cir.ptr<!cir.array<!s32i x 10>>, !cir.ptr<!cir.array<!cir.float x 10>>) {
|
|
// CHECK: acc.yield
|
|
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]}
|
|
// CHECK-NEXT: acc.yield
|
|
// CHECK-NEXT: } loc
|
|
}
|