[OpenACC][CIR] Lowering for vector_length on combined constructs
Another simple one, added tests and implemented, just like num_gangs and num_workers.
This commit is contained in:
parent
97a58b04c6
commit
716062d943
@ -242,12 +242,10 @@ public:
|
||||
operation.addVectorLengthOperand(builder.getContext(),
|
||||
createIntExpr(clause.getIntExpr()),
|
||||
lastDeviceTypeValues);
|
||||
} else if constexpr (isOneOfTypes<OpTy, mlir::acc::SerialOp>) {
|
||||
llvm_unreachable("vector_length not valid on serial");
|
||||
} else if constexpr (isCombinedType<OpTy>) {
|
||||
applyToComputeOp(clause);
|
||||
} else {
|
||||
// TODO: When we've implemented this for everything, switch this to an
|
||||
// unreachable. Combined constructs remain.
|
||||
return clauseNotImplemented(clause);
|
||||
llvm_unreachable("Unknown construct kind in VisitVectorLengthClause");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -879,4 +879,72 @@ extern "C" void acc_combined(int N, int cond) {
|
||||
// CHECK-NEXT: } loc
|
||||
// CHECK-NEXT: acc.yield
|
||||
// CHECK-NEXT: } loc
|
||||
//
|
||||
#pragma acc parallel loop vector_length(cond)
|
||||
for(unsigned I = 0; I < N; ++I);
|
||||
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
|
||||
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
|
||||
// CHECK-NEXT: acc.parallel combined(loop) vector_length(%[[CONV_CAST]] : si32) {
|
||||
// CHECK-NEXT: acc.loop combined(parallel) {
|
||||
// CHECK: acc.yield
|
||||
// CHECK-NEXT: } loc
|
||||
// CHECK-NEXT: acc.yield
|
||||
// CHECK-NEXT: } loc
|
||||
|
||||
#pragma acc kernels loop vector_length(cond) device_type(nvidia) vector_length(2u)
|
||||
for(unsigned I = 0; I < N; ++I);
|
||||
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
|
||||
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
|
||||
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
|
||||
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
|
||||
// CHECK-NEXT: acc.kernels combined(loop) vector_length(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
|
||||
// CHECK-NEXT: acc.loop combined(kernels) {
|
||||
// CHECK: acc.yield
|
||||
// CHECK-NEXT: } loc
|
||||
// CHECK-NEXT: acc.terminator
|
||||
// CHECK-NEXT: } loc
|
||||
|
||||
#pragma acc parallel loop vector_length(cond) device_type(nvidia, host) vector_length(2) device_type(radeon) vector_length(3)
|
||||
for(unsigned I = 0; I < N; ++I);
|
||||
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
|
||||
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
|
||||
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
|
||||
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
|
||||
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
|
||||
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
|
||||
// CHECK-NEXT: acc.parallel combined(loop) vector_length(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
|
||||
// CHECK-NEXT: acc.loop combined(parallel) {
|
||||
// CHECK: acc.yield
|
||||
// CHECK-NEXT: } loc
|
||||
// CHECK-NEXT: acc.yield
|
||||
// CHECK-NEXT: } loc
|
||||
|
||||
#pragma acc kernels loop vector_length(cond) device_type(nvidia) vector_length(2) device_type(radeon, multicore) vector_length(4)
|
||||
for(unsigned I = 0; I < N; ++I);
|
||||
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
|
||||
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
|
||||
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
|
||||
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
|
||||
// CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i
|
||||
// CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32
|
||||
// CHECK-NEXT: acc.kernels combined(loop) vector_length(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[FOUR_CAST]] : si32 [#acc.device_type<radeon>], %[[FOUR_CAST]] : si32 [#acc.device_type<multicore>]) {
|
||||
// CHECK-NEXT: acc.loop combined(kernels) {
|
||||
// CHECK: acc.yield
|
||||
// CHECK-NEXT: } loc
|
||||
// CHECK-NEXT: acc.terminator
|
||||
// CHECK-NEXT: } loc
|
||||
|
||||
#pragma acc parallel loop device_type(nvidia) vector_length(2) device_type(radeon) vector_length(3)
|
||||
for(unsigned I = 0; I < N; ++I);
|
||||
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
|
||||
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
|
||||
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
|
||||
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
|
||||
// CHECK-NEXT: acc.parallel combined(loop) vector_length(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
|
||||
// CHECK-NEXT: acc.loop combined(parallel) {
|
||||
// CHECK: acc.yield
|
||||
// CHECK-NEXT: } loc
|
||||
// CHECK-NEXT: acc.yield
|
||||
// CHECK-NEXT: } loc
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user