[Polly] Remove Polly-ACC.

Polly-ACC is unmaintained and has never been ported to the NPM pipeline; since D136621 it is not even accessible anymore without manually specifying the passes on the `opt` command line.

Since there is no plan to bring it into a maintainable state, remove it from Polly.

Reviewed By: grosser

Differential Revision: https://reviews.llvm.org/D142580
This commit is contained in:
Michael Kruse 2023-01-25 14:03:57 -06:00
parent 115c7beda7
commit 19afbfe331
166 changed files with 29 additions and 31394 deletions

View File

@ -85,31 +85,6 @@ endif ()
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
# Opt-in switch for Polly-ACC. GPU_CODEGEN only becomes TRUE when the switch is
# ON and NVPTX is listed in LLVM_TARGETS_TO_BUILD.
option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
set(GPU_CODEGEN FALSE)
if(POLLY_ENABLE_GPGPU_CODEGEN)
  if(NOT "NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
    message(WARNING "The LLVM NVPTX target is required for GPU code generation")
  else()
    # CUDA and OpenCL are looked up without REQUIRED: the GPU code generation
    # test cases can be run without a CUDA/OpenCL library.
    find_package(CUDA)
    find_package(OpenCL)
    set(GPU_CODEGEN TRUE)
  endif()
endif()
# Advertise each GPU runtime library that was found: add the matching
# HAS_LIB* definition and put the library's headers on the include path.
if(CUDA_FOUND)
  add_definitions(-DHAS_LIBCUDART)
  include_directories(${CUDA_INCLUDE_DIRS})
endif()
if(OpenCL_FOUND)
  add_definitions(-DHAS_LIBOPENCL)
  include_directories(${OpenCL_INCLUDE_DIR})
endif()
option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
if (NOT POLLY_BUNDLED_ISL)
find_package(ISL MODULE REQUIRED)
@ -155,7 +130,6 @@ add_subdirectory(test)
# The unit tests are only configured when POLLY_GTEST_AVAIL is set.
if (POLLY_GTEST_AVAIL)
add_subdirectory(unittests)
endif ()
# The tools and cmake subdirectories are always configured.
add_subdirectory(tools)
add_subdirectory(cmake)
# TODO: docs.

View File

@ -27,9 +27,6 @@ if (NOT WIN32 AND LLVM_ENABLE_PIC)
# LLVMPolly is a dummy target on Win or if PIC code is disabled.
list(APPEND POLLY_CONFIG_EXPORTED_TARGETS LLVMPolly)
endif()
# Polly-ACC: PollyPPCG joins the list of exported targets when GPGPU code
# generation is enabled.
if (POLLY_ENABLE_GPGPU_CODEGEN)
list(APPEND POLLY_CONFIG_EXPORTED_TARGETS PollyPPCG)
endif()
# Get the target type for every exported target
foreach(tgt IN LISTS POLLY_CONFIG_EXPORTED_TARGETS)

View File

@ -8,7 +8,6 @@ find_package(LLVM ${LLVM_VERSION} EXACT REQUIRED CONFIG
set(Polly_CMAKE_DIR ${CMAKE_CURRENT_LIST_DIR})
set(Polly_BUNDLED_ISL @POLLY_BUNDLED_ISL@)
set(Polly_ENABLE_GPGPU_CODEGEN @POLLY_ENABLE_GPGPU_CODEGEN@)
set(Polly_DEFINITIONS ${LLVM_DEFINITIONS})
set(Polly_INCLUDE_DIRS @POLLY_CONFIG_INCLUDE_DIRS@ ${LLVM_INCLUDE_DIRS})
@ -19,17 +18,9 @@ set(Polly_LIBRARIES ${LLVM_LIBRARIES} ${Polly_EXPORTED_TARGETS})
# Imported Targets:
@ISL_CONFIG_CODE@
# Polly-ACC: declare the imported PollyPPCG target (unless a target of that
# name already exists) and let it link against the ISL target.
if (Polly_ENABLE_GPGPU_CODEGEN AND NOT TARGET PollyPPCG)
add_library(PollyPPCG @POLLY_CONFIG_TARGET_PollyPPCG_TYPE@ IMPORTED)
set_property(TARGET PollyPPCG PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@)
endif()
# Declare the imported Polly target unless a target of that name already exists.
if (NOT TARGET Polly)
add_library(Polly @POLLY_CONFIG_TARGET_Polly_TYPE@ IMPORTED)
set_property(TARGET Polly PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@)
# With GPGPU code generation enabled, PollyPPCG is appended to Polly's
# interface link libraries.
if (Polly_ENABLE_GPGPU_CODEGEN)
set_property(TARGET Polly APPEND PROPERTY INTERFACE_LINK_LIBRARIES PollyPPCG)
endif()
endif()
if (NOT TARGET LLVMPolly)

View File

@ -21,3 +21,5 @@ In Polly |version| the following important changes have been incorporated.
In the future we hope that Polly can collaborate better with LoopVectorize,
such as Polly marking a loop as safe to vectorize with a specific SIMD width,
instead of replicating its functionality.
- Polly-ACC has been removed.

View File

@ -1,33 +0,0 @@
//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//
#ifndef POLLY_PPCGCODEGENERATION_H
#define POLLY_PPCGCODEGENERATION_H
/// The GPU Architecture to target.
///
/// Note: this enum is declared in the global namespace, not in `polly`.
enum GPUArch { NVPTX64, SPIR32, SPIR64 };
/// The GPU Runtime implementation to use.
///
/// Note: this enum is declared in the global namespace, not in `polly`.
enum GPURuntime { CUDA, OpenCL };
namespace polly {
/// Declared here, defined elsewhere.
/// NOTE(review): presumably set when managed memory is requested for GPU code
/// generation -- confirm against the definition.
extern bool PollyManagedMemory;
/// Use for pass instantiation defaults.
/// @{
extern GPURuntime GPURuntimeChoice;
extern GPUArch GPUArchChoice;
/// @}
} // namespace polly
#endif // POLLY_PPCGCODEGENERATION_H

View File

@ -30,24 +30,20 @@ namespace polly {
struct RuntimeDebugBuilder {
/// Generate a constant string into the builder's llvm::Module which can be
/// passed to createGPUPrinter() or createGPUPrinter().
/// passed to createCPUPrinter().
///
/// @param Builder The builder used to emit the printer calls.
/// @param Str The string to be printed.
/// @return A global containing @p Str.
static llvm::Value *getPrintableString(PollyIRBuilder &Builder,
llvm::StringRef Str) {
// TODO: Get rid of magic number 4. It it NVPTX's constant address space and
// works on X86 (CPU) only because its backend ignores the address space.
return Builder.CreateGlobalStringPtr(Str, "", 4);
}
llvm::StringRef Str);
/// Return whether an llvm::Value of the type @p Ty is printable for
/// debugging.
///
/// That is, whether such a value can be passed to createGPUPrinter() or
/// createGPUPrinter() to be dumped as runtime. If false is returned, those
/// That is, whether such a value can be passed to createCPUPrinter()
/// to be dumped at runtime. If false is returned, that
/// function will fail.
static bool isPrintable(llvm::Type *Ty);
@ -64,62 +60,41 @@ struct RuntimeDebugBuilder {
template <typename... Args>
static void createCPUPrinter(PollyIRBuilder &Builder, Args... args) {
std::vector<llvm::Value *> Vector;
createPrinter(Builder, /* CPU */ false, Vector, args...);
}
/// Print a set of LLVM-IR Values or StringRefs on an NVIDIA GPU.
///
/// This function emits a call to vprintf that will print the given
/// arguments from within a kernel thread. It is useful for debugging
/// CUDA program kernels. All arguments given in this list will be
/// automatically concatenated and the resulting string will be printed
/// atomically. We also support ArrayRef arguments, which can be used to
/// provide for example a list of thread-id values.
///
/// @param Builder The builder used to emit the printer calls.
/// @param Args The list of values to print.
template <typename... Args>
static void createGPUPrinter(PollyIRBuilder &Builder, Args... args) {
std::vector<llvm::Value *> Vector;
createPrinter(Builder, /* GPU */ true, Vector, args...);
createPrinter(Builder, Vector, args...);
}
private:
/// Handle Values.
template <typename... Args>
static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
static void createPrinter(PollyIRBuilder &Builder,
std::vector<llvm::Value *> &Values,
llvm::Value *Value, Args... args) {
Values.push_back(Value);
createPrinter(Builder, UseGPU, Values, args...);
createPrinter(Builder, Values, args...);
}
/// Handle StringRefs.
template <typename... Args>
static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
static void createPrinter(PollyIRBuilder &Builder,
std::vector<llvm::Value *> &Values,
llvm::StringRef String, Args... args) {
Values.push_back(getPrintableString(Builder, String));
createPrinter(Builder, UseGPU, Values, args...);
createPrinter(Builder, Values, args...);
}
/// Handle ArrayRefs.
template <typename... Args>
static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
static void createPrinter(PollyIRBuilder &Builder,
std::vector<llvm::Value *> &Values,
llvm::ArrayRef<llvm::Value *> Array, Args... args) {
Values.insert(Values.end(), Array.begin(), Array.end());
createPrinter(Builder, UseGPU, Values, args...);
createPrinter(Builder, Values, args...);
}
/// Print a list of Values.
static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
static void createPrinter(PollyIRBuilder &Builder,
llvm::ArrayRef<llvm::Value *> Values);
/// Print a list of Values on a GPU.
static void createGPUPrinterT(PollyIRBuilder &Builder,
llvm::ArrayRef<llvm::Value *> Values);
/// Print a list of Values on a CPU.
static void createCPUPrinterT(PollyIRBuilder &Builder,
llvm::ArrayRef<llvm::Value *> Values);
@ -145,22 +120,6 @@ private:
///
/// @param Builder The builder used to insert the code.
static void createFlush(PollyIRBuilder &Builder);
/// Get (and possibly insert) a NVIDIA address space cast call.
static llvm::Function *getAddressSpaceCast(PollyIRBuilder &Builder,
unsigned Src, unsigned Dst,
unsigned SrcBits = 8,
unsigned DstBits = 8);
/// Get identifiers that describe the currently executed GPU thread.
///
/// The result will be a vector that if passed to the GPU printer will result
/// into a string (initialized to values corresponding to the printing
/// thread):
///
/// "> block-id: bidx bid1y bidz | thread-id: tidx tidy tidz "
static std::vector<llvm::Value *>
getGPUThreadIdentifiers(PollyIRBuilder &Builder);
};
} // namespace polly

View File

@ -12,7 +12,4 @@
#ifndef POLLY_CONFIG_H
#define POLLY_CONFIG_H
#cmakedefine CUDA_FOUND
#cmakedefine GPU_CODEGEN
#endif

View File

@ -14,7 +14,6 @@
#ifndef POLLY_LINKALLPASSES_H
#define POLLY_LINKALLPASSES_H
#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/Config/config.h"
#include "polly/Support/DumpFunctionPass.h"
#include "polly/Support/DumpModulePass.h"
@ -54,14 +53,6 @@ llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS);
llvm::Pass *createIslAstInfoWrapperPassPass();
llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS);
llvm::Pass *createCodeGenerationPass();
#ifdef GPU_CODEGEN
llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
GPURuntime Runtime = GPURuntime::CUDA);
llvm::Pass *
createManagedMemoryRewritePassPass(GPUArch Arch = GPUArch::NVPTX64,
GPURuntime Runtime = GPURuntime::CUDA);
#endif
llvm::Pass *createIslScheduleOptimizerWrapperPass();
llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS);
llvm::Pass *createFlattenSchedulePass();
@ -113,10 +104,6 @@ struct PollyForcePassLinking {
polly::createIslAstInfoWrapperPassPass();
polly::createIslAstInfoPrinterLegacyPass(llvm::outs());
polly::createCodeGenerationPass();
#ifdef GPU_CODEGEN
polly::createPPCGCodeGenerationPass();
polly::createManagedMemoryRewritePassPass();
#endif
polly::createIslScheduleOptimizerWrapperPass();
polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs());
polly::createMaximalStaticExpansionPass();
@ -156,10 +143,6 @@ void initializeDependenceInfoPrinterLegacyFunctionPassPass(
void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &);
void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &);
void initializeCodeGenerationPass(llvm::PassRegistry &);
#ifdef GPU_CODEGEN
void initializePPCGCodeGenerationPass(llvm::PassRegistry &);
void initializeManagedMemoryRewritePassPass(llvm::PassRegistry &);
#endif
void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &);
void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &);
void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &);

View File

@ -1684,9 +1684,6 @@ private:
/// Number of copy statements.
unsigned CopyStmtsNum = 0;
/// Flag to indicate if the Scop is to be skipped.
bool SkipScop = false;
using StmtSet = std::list<ScopStmt>;
/// The statements in this Scop.
@ -2144,12 +2141,6 @@ public:
/// Check if the SCoP has been optimized by the scheduler.
bool isOptimized() const { return IsOptimized; }
/// Mark the SCoP to be skipped by ScopPass passes.
void markAsToBeSkipped() { SkipScop = true; }
/// Check if the SCoP is to be skipped by ScopPass passes.
bool isToBeSkipped() const { return SkipScop; }
/// Return the ID of the Scop
int getID() const { return ID; }

View File

@ -1,42 +0,0 @@
//===- Support/LinkGPURuntime.h -- Headerfile to help force-link GPURuntime =//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header helps pull in libGPURuntime.so
//
//===----------------------------------------------------------------------===//
#ifndef POLLY_LINK_GPURUNTIME
#define POLLY_LINK_GPURUNTIME
#include <cstdlib>

extern "C" {
#include "GPURuntime/GPUJIT.h"
}
namespace polly {
/// Helper whose constructor references the GPURuntime entry points so that
/// they are not discarded and libGPURuntime.so gets pulled in.
///
/// NOTE(review): `structure` is a namespace-scope object defined in a header;
/// including this header from more than one translation unit would define it
/// more than once -- confirm it is only included in a single place.
struct ForceGPURuntimeLinking {
  ForceGPURuntimeLinking() {
    // We must reference GPURuntime in such a way that compilers will not
    // delete it all as dead code, even with whole program optimization,
    // yet is effectively a NO-OP. As the compiler isn't smart enough
    // to know that getenv() never returns -1, this will do the job.
    if (std::getenv("bar") != (char *)-1)
      return;

    // Never executed; each entry point only needs to be referenced once.
    polly_initContextCL();
    polly_initContextCUDA();
    polly_getKernel(nullptr, nullptr);
    polly_freeKernel(nullptr);
    polly_copyFromHostToDevice(nullptr, nullptr, 0);
    polly_copyFromDeviceToHost(nullptr, nullptr, 0);
    polly_synchronizeDevice();
    polly_launchKernel(nullptr, 0, 0, 0, 0, 0, nullptr);
    polly_freeDeviceMemory(nullptr);
    polly_freeContext(nullptr);
  }
} structure;
} // namespace polly
#endif

View File

@ -6,13 +6,6 @@ set(ISL_CODEGEN_FILES
CodeGen/IslNodeBuilder.cpp
CodeGen/CodeGeneration.cpp)
# Polly-ACC sources; GPGPU_CODEGEN_FILES is only set here when GPU_CODEGEN is
# enabled.
if(GPU_CODEGEN)
  set(GPGPU_CODEGEN_FILES
    CodeGen/PPCGCodeGeneration.cpp
    CodeGen/ManagedMemoryRewrite.cpp)
endif()
# Compile ISL into a separate library.
add_subdirectory(External)
@ -44,12 +37,6 @@ set(POLLY_COMPONENTS
Vectorize
)
# Polly-ACC requires the NVPTX backend to work. Ask LLVM about its libraries.
if (GPU_CODEGEN)
# Requesting this component is expected to produce an error if the NVPTX
# backend is not enabled.
list(APPEND POLLY_COMPONENTS NVPTX)
endif ()
# Use an object-library to add the same files to multiple libs without requiring
# the sources to be recompiled for each of them.
add_llvm_pass_plugin(Polly
@ -73,7 +60,6 @@ add_llvm_pass_plugin(Polly
CodeGen/Utils.cpp
CodeGen/RuntimeDebugBuilder.cpp
CodeGen/PerfMonitor.cpp
${GPGPU_CODEGEN_FILES}
Exchange/JSONExporter.cpp
Support/GICHelper.cpp
Support/SCEVAffinator.cpp
@ -127,16 +113,6 @@ target_link_libraries(Polly PUBLIC
${ISL_TARGET}
)
# Additional dependencies for Polly-ACC.
if (GPU_CODEGEN)
target_link_libraries(Polly PUBLIC PollyPPCG)
endif ()
# NOTE(review): this condition does not test GPU_CODEGEN, and it changes the
# link libraries of the bugpoint target, which is presumably defined outside
# this file -- confirm both are intended.
if (NOT LLVM_LINK_LLVM_DYLIB AND NOT LLVM_POLLY_LINK_INTO_TOOLS)
# Polly-ACC requires the NVPTX target to be present in the executable it is linked to
set_property(TARGET bugpoint APPEND PROPERTY LINK_LIBRARIES LLVMTarget)
endif ()
# Create a loadable module Polly.so that can be loaded using
# LLVM's/clang's "-load" option.
if (WIN32 OR NOT LLVM_ENABLE_PIC)
@ -150,19 +126,6 @@ else ()
$<TARGET_OBJECTS:obj.Polly>
)
# Only add the dependencies that are not part of LLVM. The latter are assumed
# to be already available in the address space the module is loaded into.
# Adding them once more would have the effect that both copies try to register
# the same command line options, to which LLVM reacts with an error.
# If Polly-ACC is enabled, the NVPTX target is also expected to reside in the
# host. This is not the case for bugpoint. Use LLVM_POLLY_LINK_INTO_TOOLS=ON
# instead, which will automatically resolve the additional dependencies of
# Polly.
target_link_libraries(LLVMPolly PUBLIC ${ISL_TARGET})
if (GPU_CODEGEN)
target_link_libraries(LLVMPolly PUBLIC PollyPPCG)
endif ()
set_target_properties(LLVMPolly
PROPERTIES
LINKER_LANGUAGE CXX

View File

@ -238,14 +238,8 @@ void BlockGenerator::copyInstScalar(ScopStmt &Stmt, Instruction *Inst,
Builder.Insert(NewInst);
BBMap[Inst] = NewInst;
// When copying the instruction onto the Module meant for the GPU,
// debug metadata attached to an instruction causes all related
// metadata to be pulled into the Module. This includes the DICompileUnit,
// which will not be listed in llvm.dbg.cu of the Module since the Module
// doesn't contain one. This fails the verification of the Module and the
// subsequent generation of the ASM string.
if (NewInst->getModule() != Inst->getModule())
NewInst->setDebugLoc(llvm::DebugLoc());
assert(NewInst->getModule() == Inst->getModule() &&
"Expecting instructions to be in the same module");
if (!NewInst->getType()->isVoidTy())
NewInst->setName("p_" + Inst->getName());

View File

@ -323,10 +323,6 @@ public:
/// Generate LLVM-IR for the SCoP @p S.
bool runOnScop(Scop &S) override {
// Skip SCoPs in case they're already code-generated by PPCGCodeGeneration.
if (S.isToBeSkipped())
return false;
AI = &getAnalysis<IslAstInfoWrapperPass>().getAI();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

View File

@ -638,10 +638,6 @@ isl::ast_build IslAstInfo::getBuild(const isl::ast_node &Node) {
static std::unique_ptr<IslAstInfo> runIslAst(
Scop &Scop,
function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps) {
// Skip SCoPs in case they're already handled by PPCGCodeGeneration.
if (Scop.isToBeSkipped())
return {};
ScopsProcessed++;
const Dependences &D = GetDeps(Dependences::AL_Statement);

View File

@ -1,427 +0,0 @@
//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Take a module and rewrite:
// 1. `malloc` -> `polly_mallocManaged`
// 2. `free` -> `polly_freeManaged`
// 3. global arrays with initializers -> global arrays that are initialized
// with a constructor call to
// `polly_mallocManaged`.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/IRBuilder.h"
#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopDetection.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/InitializePasses.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
using namespace polly;
/// Hidden command line flag `-polly-acc-rewrite-allocas`; when set,
/// runOnModule() additionally rewrites the allocas selected by
/// getAllocasToBeManaged().
static cl::opt<bool> RewriteAllocas(
"polly-acc-rewrite-allocas",
cl::desc(
"Ask the managed memory rewriter to also rewrite alloca instructions"),
cl::Hidden, cl::cat(PollyCategory));
/// Hidden command line flag `-polly-acc-rewrite-ignore-linkage-for-globals`;
/// when set, replaceGlobalArray() no longer restricts itself to globals with
/// private or internal linkage.
static cl::opt<bool> IgnoreLinkageForGlobals(
"polly-acc-rewrite-ignore-linkage-for-globals",
cl::desc(
"By default, we only rewrite globals with internal linkage. This flag "
"enables rewriting of globals regardless of linkage"),
cl::Hidden, cl::cat(PollyCategory));
#define DEBUG_TYPE "polly-acc-rewrite-managed-memory"
namespace {
/// Return the function named `polly_mallocManaged` in \p M.
///
/// If the module has no function of that name, an external declaration taking
/// an i64 and returning an i8 pointer is added to \p M and returned.
static llvm::Function *getOrCreatePollyMallocManaged(Module &M) {
  const char *Name = "polly_mallocManaged";
  if (Function *Existing = M.getFunction(Name))
    return Existing;

  // TODO: The size parameter is hard-coded to i64; `size_t` should presumably
  // be derived from the DataLayout instead.
  PollyIRBuilder Builder(M.getContext());
  FunctionType *FnTy = FunctionType::get(Builder.getInt8PtrTy(),
                                         {Builder.getInt64Ty()}, false);
  return Function::Create(FnTy, Function::ExternalLinkage, Name, &M);
}
/// Return the function named `polly_freeManaged` in \p M.
///
/// If the module has no function of that name, an external declaration taking
/// an i8 pointer and returning void is added to \p M and returned.
static llvm::Function *getOrCreatePollyFreeManaged(Module &M) {
  const char *Name = "polly_freeManaged";
  if (Function *Existing = M.getFunction(Name))
    return Existing;

  PollyIRBuilder Builder(M.getContext());
  FunctionType *FnTy =
      FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false);
  return Function::Create(FnTy, Function::ExternalLinkage, Name, &M);
}
// Expand the constant expression `Cur`, which is operand number `index` of
// the instruction `Parent`, into instructions.
//
// `Cur` is converted to an instruction, installed as operand `index` of
// `Parent`, and inserted immediately before `Parent`. Its operands are then
// visited in operand order and every operand that is itself a `ConstantExpr`
// is expanded recursively in the same way. Since a constant expression can
// expand to multiple instructions, every instruction created here is added
// to the set `Expands`.
//
// An expression is therefore materialized before its own operands are
// visited. For
//   A * ((B * D) + C)
// the multiplication is expanded first, then A, then (B * D) + C, then
// B * D, then B, then D, and finally C (constants that are not
// `ConstantExpr`s are left untouched).
//
// ConstantExprs are not really trees but DAGs, since both operands of an
// expression can be the very same constant:
//
//      (*)
//     /   \       <- both operands
//     \   /          refer to the
//      (D)           same D
//
// For the purposes of this expansion, the two occurrences of D are expanded
// separately, i.e. the DAG is expanded into the tree:
//
//      (*)
//     /   \       <- two separate
//   (D)   (D)        copies of D
//
// TODO: We don't _have_ to do this, but this is the simplest solution.
// We can write a solution that keeps track of which constants have been
// already expanded.
static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder,
Instruction *Parent, int index,
SmallPtrSet<Instruction *, 4> &Expands) {
assert(Cur && "invalid constant expression passed");
Instruction *I = Cur->getAsInstruction();
assert(I && "unable to convert ConstantExpr to Instruction");
LLVM_DEBUG(dbgs() << "Expanding ConstantExpression: (" << *Cur
<< ") in Instruction: (" << *I << ")\n";);
// Invalidate `Cur` so that no one after this point uses `Cur`. Rather,
// they should mutate `I`.
Cur = nullptr;
Expands.insert(I);
Parent->setOperand(index, I);
// The things that `Parent` uses (its operands) should be created
// before `Parent`.
Builder.SetInsertPoint(Parent);
Builder.Insert(I);
for (unsigned i = 0; i < I->getNumOperands(); i++) {
Value *Op = I->getOperand(i);
assert(isa<Constant>(Op) && "constant must have a constant operand");
if (ConstantExpr *CExprOp = dyn_cast<ConstantExpr>(Op))
expandConstantExpr(CExprOp, Builder, I, i, Expands);
}
}
// Replace every use of `OldVal` by `NewVal` in `Inst`, including uses that
// sit inside `ConstantExpr` operands of `Inst`.
//
// Replacing operands directly on `Inst` is not sufficient, because that does
// not rewrite values inside `ConstantExpr`s. Every `ConstantExpr` operand of
// `Inst` is therefore first expanded into instructions; afterwards the
// replacement is applied to `Inst` and to each expanded instruction.
static void rewriteOldValToNew(Instruction *Inst, Value *OldVal, Value *NewVal,
                               PollyIRBuilder &Builder) {
  // Instructions in which `OldVal` has to be replaced: `Inst` itself plus
  // everything that expanding its `ConstantExpr` operands produces.
  SmallPtrSet<Instruction *, 4> Worklist;
  Worklist.insert(Inst);

  for (unsigned OpIdx = 0; OpIdx < Inst->getNumOperands(); ++OpIdx)
    if (auto *CE = dyn_cast<ConstantExpr>(Inst->getOperand(OpIdx)))
      expandConstantExpr(CE, Builder, Inst, OpIdx, Worklist);

  // After the expansion none of the collected instructions has a
  // `ConstantExpr` operand left, so a plain operand replacement suffices.
  for (Instruction *Visited : Worklist)
    Visited->replaceUsesOfWith(OldVal, NewVal);
}
// Collect into `Owners` the instructions that (transitively) contain `V`.
//
// If `V` is an instruction it is recorded directly. Otherwise `V` must be a
// `Constant`; its users are then followed recursively until instructions are
// reached, because a constant that uses a value can itself be used by further
// constants before an instruction is reached.
static void getInstructionUsersOfValue(Value *V,
                                       SmallVector<Instruction *, 4> &Owners) {
  if (auto *Inst = dyn_cast<Instruction>(V)) {
    Owners.push_back(Inst);
    return;
  }

  // Anything that is a `User` must be a constant or an instruction.
  for (User *U : cast<Constant>(V)->users())
    getInstructionUsersOfValue(U, Owners);
}
// Replace the zero-initialized global array `Array` by a global pointer
// "<name>.toptr" that is filled in at program start by a generated function
// "<name>.constructor", which calls `polly_mallocManaged` with the allocation
// size of the array type.
//
// Globals that are not arrays, that are visible outside the module (unless
// IgnoreLinkageForGlobals is set), or that do not have a zeroinitializer are
// left untouched. Every rewritten global is added to `ReplacedGlobals`; it is
// not erased here.
static void
replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array,
SmallPtrSet<GlobalVariable *, 4> &ReplacedGlobals) {
// We only want arrays.
ArrayType *ArrayTy = dyn_cast<ArrayType>(Array.getValueType());
if (!ArrayTy)
return;
Type *ElemTy = ArrayTy->getElementType();
PointerType *ElemPtrTy = ElemTy->getPointerTo();
// We only wish to replace arrays that are only visible inside the module
// they inhabit. Otherwise, our type edit from [T] to T* would be illegal
// across modules.
const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() ||
Array.hasInternalLinkage() ||
IgnoreLinkageForGlobals;
if (!OnlyVisibleInsideModule) {
LLVM_DEBUG(
dbgs() << "Not rewriting (" << Array
<< ") to managed memory "
"because it could be visible externally. To force rewrite, "
"use -polly-acc-rewrite-ignore-linkage-for-globals.\n");
return;
}
// Only arrays with a zeroinitializer are rewritten; this branch is also
// taken when the array has no initializer at all.
if (!Array.hasInitializer() ||
!isa<ConstantAggregateZero>(Array.getInitializer())) {
LLVM_DEBUG(dbgs() << "Not rewriting (" << Array
<< ") to managed memory "
"because it has an initializer which is "
"not a zeroinitializer.\n");
return;
}
// At this point, we have committed to replacing this array.
ReplacedGlobals.insert(&Array);
// The replacement global holds a pointer to the element type and starts out
// as a null pointer.
std::string NewName = Array.getName().str();
NewName += ".toptr";
GlobalVariable *ReplacementToArr =
cast<GlobalVariable>(M.getOrInsertGlobal(NewName, ElemPtrTy));
ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy));
Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
// Build the constructor function that allocates the managed memory and
// stores the resulting pointer into the replacement global.
std::string FnName = Array.getName().str();
FnName += ".constructor";
PollyIRBuilder Builder(M.getContext());
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
Function *F = Function::Create(Ty, Linkage, FnName, &M);
BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F);
Builder.SetInsertPoint(Start);
const uint64_t ArraySizeInt = DL.getTypeAllocSize(ArrayTy);
Value *ArraySize = Builder.getInt64(ArraySizeInt);
ArraySize->setName("array.size");
Value *AllocatedMemRaw =
Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw");
Value *AllocatedMemTyped =
Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed");
Builder.CreateStore(AllocatedMemTyped, ReplacementToArr);
Builder.CreateRetVoid();
// Register the constructor in the module's global constructor list.
const int Priority = 0;
appendToGlobalCtors(M, F, Priority, ReplacementToArr);
SmallVector<Instruction *, 4> ArrayUserInstructions;
// Get all instructions that use array. We need to do this weird thing
// because `Constant`s that contain this array need to be expanded into
// instructions so that we can replace their parameters. `Constant`s cannot
// be edited easily, so we choose to convert all `Constant`s to
// `Instruction`s and handle all of the uses of `Array` uniformly.
for (Use &ArrayUse : Array.uses())
getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions);
// In front of every using instruction, load the managed pointer, cast it
// back to a pointer to the array type and substitute it for `Array`.
for (Instruction *UserOfArrayInst : ArrayUserInstructions) {
Builder.SetInsertPoint(UserOfArrayInst);
// <ty>** -> <ty>*
Value *ArrPtrLoaded =
Builder.CreateLoad(ElemPtrTy, ReplacementToArr, "arrptr.load");
// <ty>* -> [ty]*
Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast(
ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast");
rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder);
}
}
// Collect into `Allocas` every alloca instruction of `F` whose pointer may be
// captured (stores count as captures, returns do not). These are the allocas
// that rewriteAllocaAsManagedMemory() may later turn into calls to
// `polly_mallocManaged`.
static void getAllocasToBeManaged(Function &F,
                                  SmallSet<AllocaInst *, 4> &Allocas) {
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (auto *Alloca = dyn_cast<AllocaInst>(&I)) {
        LLVM_DEBUG(dbgs() << "Checking if (" << *Alloca
                          << ") may be captured: ");

        const bool Captured =
            PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false,
                                 /* StoreCaptures */ true);
        if (!Captured) {
          LLVM_DEBUG(dbgs() << "NO (not captured).\n");
          continue;
        }

        Allocas.insert(Alloca);
        LLVM_DEBUG(dbgs() << "YES (captured).\n");
      }
    }
  }
}
// Replace `Alloca` by a call to `polly_mallocManaged` of the allocation size
// of the allocated type, and insert a call to `polly_freeManaged` on that
// memory in front of every return instruction of the enclosing function.
static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca,
                                         const DataLayout &DL) {
  LLVM_DEBUG(dbgs() << "rewriting: (" << *Alloca << ") to managed mem.\n");

  Module *M = Alloca->getModule();
  assert(M && "Alloca does not have a module");

  // Allocate the managed memory right where the alloca used to be.
  PollyIRBuilder Builder(M->getContext());
  Builder.SetInsertPoint(Alloca);
  Function *MallocManagedFn = getOrCreatePollyMallocManaged(*M);
  Value *SizeVal =
      Builder.getInt64(DL.getTypeAllocSize(Alloca->getAllocatedType()));
  Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal});
  Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType());

  Function *F = Alloca->getFunction();
  assert(F && "Alloca has invalid function");

  // The casted managed pointer takes over the name and all uses of the alloca.
  Bitcasted->takeName(Alloca);
  Alloca->replaceAllUsesWith(Bitcasted);
  Alloca->eraseFromParent();

  // Release the managed memory in front of every return of the function.
  for (BasicBlock &BB : *F) {
    if (auto *Return = dyn_cast<ReturnInst>(BB.getTerminator())) {
      Builder.SetInsertPoint(Return);
      Function *FreeManagedFn = getOrCreatePollyFreeManaged(*M);
      Builder.CreateCall(FreeManagedFn, {RawManagedMem});
    }
  }
}
// Replace all uses of `Old` with `New`, even inside `ConstantExpr`.
//
// `replaceAllUsesWith` does not replace values inside `ConstantExpr`s; this
// function does. The caveat is that a use *outside* of a function (say, in a
// global declaration) cannot be handled, so this is meant for values that are
// known to be used within functions only.
//
// The uses of `Old` are followed through any constants up to the owning
// instructions. For each such instruction, its `ConstantExpr` operands are
// expanded into instructions and `Old` is replaced by `New` in the result.
static void replaceAllUsesAndConstantUses(Value *Old, Value *New,
                                          PollyIRBuilder &Builder) {
  // Gather the owning instructions first; constants cannot be edited easily,
  // so all uses are handled uniformly at the instruction level.
  SmallVector<Instruction *, 4> Owners;
  for (Use &OldUse : Old->uses())
    getInstructionUsersOfValue(OldUse.getUser(), Owners);

  for (Instruction *Owner : Owners)
    rewriteOldValToNew(Owner, Old, New, Builder);
}
/// Module pass that redirects allocations to the Polly managed-memory
/// functions:
///   1. uses of `malloc` are redirected to `polly_mallocManaged`,
///   2. uses of `free` are redirected to `polly_freeManaged`,
///   3. eligible global arrays are replaced via replaceGlobalArray(),
///   4. if RewriteAllocas is set, captured allocas are rewritten as well.
class ManagedMemoryRewritePass final : public ModulePass {
public:
  static char ID;
  // Default member initializers make sure a default-constructed pass never
  // carries indeterminate values; createManagedMemoryRewritePassPass()
  // overwrites both members.
  GPUArch Architecture = GPUArch::NVPTX64;
  GPURuntime Runtime = GPURuntime::CUDA;

  ManagedMemoryRewritePass() : ModulePass(ID) {}

  /// Rewrite the allocations of \p M.
  ///
  /// @return true if the module was modified, false otherwise.
  bool runOnModule(Module &M) override {
    const DataLayout &DL = M.getDataLayout();
    bool Changed = false;

    if (Function *Malloc = M.getFunction("malloc")) {
      PollyIRBuilder Builder(M.getContext());
      Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
      assert(PollyMallocManaged && "unable to create polly_mallocManaged");
      replaceAllUsesAndConstantUses(Malloc, PollyMallocManaged, Builder);
      Malloc->eraseFromParent();
      Changed = true;
    }

    if (Function *Free = M.getFunction("free")) {
      PollyIRBuilder Builder(M.getContext());
      Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M);
      assert(PollyFreeManaged && "unable to create polly_freeManaged");
      replaceAllUsesAndConstantUses(Free, PollyFreeManaged, Builder);
      Free->eraseFromParent();
      Changed = true;
    }

    // replaceGlobalArray() records every global it rewrote; the originals are
    // erased only after the walk over all globals has finished.
    SmallPtrSet<GlobalVariable *, 4> GlobalsToErase;
    for (GlobalVariable &Global : M.globals())
      replaceGlobalArray(M, DL, Global, GlobalsToErase);
    for (GlobalVariable *G : GlobalsToErase)
      G->eraseFromParent();
    Changed |= !GlobalsToErase.empty();

    // Rewrite allocas to managed allocations if we are asked to do so.
    if (RewriteAllocas) {
      SmallSet<AllocaInst *, 4> AllocasToBeManaged;
      for (Function &F : M.functions())
        getAllocasToBeManaged(F, AllocasToBeManaged);

      for (AllocaInst *Alloca : AllocasToBeManaged)
        rewriteAllocaAsManagedMemory(Alloca, DL);
      Changed |= !AllocasToBeManaged.empty();
    }

    return Changed;
  }
};
} // namespace
char ManagedMemoryRewritePass::ID = 42;

/// Create a ManagedMemoryRewritePass and record the requested architecture
/// and runtime on it. Ownership of the new pass passes to the caller.
Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch,
                                                GPURuntime Runtime) {
  auto *RewritePass = new ManagedMemoryRewritePass();
  RewritePass->Architecture = Arch;
  RewritePass->Runtime = Runtime;
  return RewritePass;
}
// Register the pass under the name "polly-acc-rewrite-managed-memory".
// NOTE(review): the dependencies listed below are declared for
// initialization only; the pass class defines no getAnalysisUsage(), so
// whether they are all required should be confirmed.
INITIALIZE_PASS_BEGIN(
ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
"Polly - Rewrite all allocations in heap & data section to managed memory",
false, false)
INITIALIZE_PASS_DEPENDENCY(PPCGCodeGeneration);
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
INITIALIZE_PASS_END(
ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
"Polly - Rewrite all allocations in heap & data section to managed memory",
false, false)

File diff suppressed because it is too large Load Diff

View File

@ -9,7 +9,6 @@
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/RuntimeDebugBuilder.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include <string>
#include <vector>
@ -17,6 +16,16 @@
using namespace llvm;
using namespace polly;
/// Create a global string constant holding @p Str and return a pointer to it
/// that the printer recognizes as a string argument.
llvm::Value *RuntimeDebugBuilder::getPrintableString(PollyIRBuilder &Builder,
                                                     llvm::StringRef Str) {
  // FIXME: addressspace(4) is a marker for a string (for the %s conversion
  // specifier) but should be using the default address space. This only works
  // because CPU backends typically ignore the address space. For constant
  // strings as returned by getPrintableString, the format string should
  // instead directly spell out the string.
  constexpr unsigned StringMarkerAddrSpace = 4;
  llvm::Value *Printable =
      Builder.CreateGlobalStringPtr(Str, "", StringMarkerAddrSpace);
  return Printable;
}
Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
const char *Name = "vprintf";
@ -33,72 +42,9 @@ Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) {
return F;
}
/// Look up, or declare on first use, the function used to cast a pointer from
/// address space @p Src to address space @p Dst.
///
/// The function name is assembled from the fixed prefix
/// "llvm.nvvm.ptr.constant.to.gen.p" followed by the destination and source
/// address spaces and integer bit widths. If the module already contains a
/// function of that name it is reused; otherwise an external declaration
/// taking an iSrcBits pointer in @p Src and returning an iDstBits pointer in
/// @p Dst is added to the module.
Function *RuntimeDebugBuilder::getAddressSpaceCast(PollyIRBuilder &Builder,
                                                   unsigned Src, unsigned Dst,
                                                   unsigned SrcBits,
                                                   unsigned DstBits) {
  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();

  std::string CastName = "llvm.nvvm.ptr.constant.to.gen.p";
  CastName += std::to_string(Dst);
  CastName += "i";
  CastName += std::to_string(DstBits);
  CastName += ".p";
  CastName += std::to_string(Src);
  CastName += "i";
  CastName += std::to_string(SrcBits);

  // Reuse an existing declaration when there is one.
  if (Function *Existing = Mod->getFunction(CastName))
    return Existing;

  Type *ResultTy = PointerType::get(Builder.getIntNTy(DstBits), Dst);
  Type *OperandTy = PointerType::get(Builder.getIntNTy(SrcBits), Src);
  FunctionType *CastTy = FunctionType::get(ResultTy, OperandTy, false);
  return Function::Create(CastTy, Function::ExternalLinkage, CastName, Mod);
}
/// Build the list of values that identify the executing GPU thread.
///
/// The returned vector alternates string markers and 64-bit integers:
/// "> block-id: " followed by the x/y/z ctaid registers, then "| ",
/// then "thread-id: " followed by the x/y/z tid registers. Every id is
/// followed by a " " separator string.
std::vector<Value *>
RuntimeDebugBuilder::getGPUThreadIdentifiers(PollyIRBuilder &Builder) {
  std::vector<Value *> Result;
  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();

  // Appends Label, then for each register the zero-extended/truncated i64
  // value read from it and a single-space separator. The intrinsic
  // declarations are materialized before the label string is created, which
  // keeps the emission order of the two original hand-written loops.
  auto AppendIds = [&](const char *Label,
                       std::initializer_list<Intrinsic::ID> Registers) {
    std::vector<Function *> Readers;
    for (Intrinsic::ID Reg : Registers)
      Readers.push_back(Intrinsic::getDeclaration(Mod, Reg));

    Result.push_back(Builder.CreateGlobalStringPtr(Label, "", 4));
    for (Function *Reader : Readers) {
      Value *Coord = Builder.CreateCall(Reader, {});
      Coord = Builder.CreateIntCast(Coord, Builder.getInt64Ty(), false);
      Result.push_back(Coord);
      Result.push_back(Builder.CreateGlobalStringPtr(" ", "", 4));
    }
  };

  AppendIds("> block-id: ", {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
                             Intrinsic::nvvm_read_ptx_sreg_ctaid_y,
                             Intrinsic::nvvm_read_ptx_sreg_ctaid_z});

  Result.push_back(Builder.CreateGlobalStringPtr("| ", "", 4));

  AppendIds("thread-id: ", {Intrinsic::nvvm_read_ptx_sreg_tid_x,
                            Intrinsic::nvvm_read_ptx_sreg_tid_y,
                            Intrinsic::nvvm_read_ptx_sreg_tid_z});

  return Result;
}
void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, bool IsGPU,
void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder,
ArrayRef<Value *> Values) {
if (IsGPU)
createGPUPrinterT(Builder, Values);
else
createCPUPrinterT(Builder, Values);
createCPUPrinterT(Builder, Values);
}
bool RuntimeDebugBuilder::isPrintable(Type *Ty) {
@ -169,78 +115,6 @@ void RuntimeDebugBuilder::createCPUPrinterT(PollyIRBuilder &Builder,
createFlush(Builder);
}
// Emit IR that prints Values through vprintf.
//
// The values returned by getGPUThreadIdentifiers and a "\n " string are
// printed in front of Values. Every value is normalized, stored into a
// stack-allocated buffer, and a matching conversion specifier is appended to
// the format string; finally vprintf is called with the format string and the
// buffer.
void RuntimeDebugBuilder::createGPUPrinterT(PollyIRBuilder &Builder,
ArrayRef<Value *> Values) {
// Accumulates the vprintf format string, one specifier per printed value.
std::string str;
auto *Zero = Builder.getInt64(0);
auto ToPrint = getGPUThreadIdentifiers(Builder);
ToPrint.push_back(Builder.CreateGlobalStringPtr("\n ", "", 4));
ToPrint.insert(ToPrint.end(), Values.begin(), Values.end());
const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
// Allocate print buffer (assuming 2*32 bit per element)
auto T = ArrayType::get(Builder.getInt32Ty(), ToPrint.size() * 2);
// The alloca is inserted before the first instruction of the function's
// entry block rather than at the builder's current insert point.
Value *Data = new AllocaInst(
T, DL.getAllocaAddrSpace(), "polly.vprint.buffer",
&Builder.GetInsertBlock()->getParent()->getEntryBlock().front());
auto *DataPtr = Builder.CreateGEP(T, Data, {Zero, Zero});
// Index, in i32 units, of the buffer slot for the next value.
int Offset = 0;
for (auto Val : ToPrint) {
auto Ptr = Builder.CreateGEP(Builder.getInt32Ty(), DataPtr,
Builder.getInt64(Offset));
Type *Ty = Val->getType();
// Normalize the value: floating point -> double, integers -> i64,
// i8 pointers in address space 4 -> string pointer, other pointers -> i64.
if (Ty->isFloatingPointTy()) {
if (!Ty->isDoubleTy())
Val = Builder.CreateFPExt(Val, Builder.getDoubleTy());
} else if (Ty->isIntegerTy()) {
if (Ty->getIntegerBitWidth() < 64) {
Val = Builder.CreateSExt(Val, Builder.getInt64Ty());
} else {
assert(Ty->getIntegerBitWidth() == 64 &&
"Integer types larger 64 bit not supported");
// fallthrough
}
} else if (isa<PointerType>(Ty)) {
if (Ty == Builder.getInt8PtrTy(4)) {
// Pointers in constant address space are printed as strings
Val = Builder.CreateGEP(Builder.getInt8Ty(), Val, Builder.getInt64(0));
auto F = RuntimeDebugBuilder::getAddressSpaceCast(Builder, 4, 0);
Val = Builder.CreateCall(F, Val);
} else {
Val = Builder.CreatePtrToInt(Val, Builder.getInt64Ty());
}
} else {
llvm_unreachable("Unknown type");
}
// Re-read the type: the conversions above may have changed it.
Ty = Val->getType();
// Reinterpret the slot as a pointer to the value's type in address space 5
// and store the value there with 4-byte alignment.
Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Ty->getPointerTo(5));
Builder.CreateAlignedStore(Val, Ptr, Align(4));
// Pick the conversion specifier from the normalized type; anything that is
// neither floating point nor integer is printed with %s.
if (Ty->isFloatingPointTy())
str += "%f";
else if (Ty->isIntegerTy())
str += "%ld";
else
str += "%s";
// Each value occupies two i32 slots of the buffer.
Offset += 2;
}
// The format string is created in address space 4 and passed through the
// same address space cast helper as the string arguments above.
Value *Format = Builder.CreateGlobalStringPtr(str, "polly.vprintf.buffer", 4);
Format = Builder.CreateCall(getAddressSpaceCast(Builder, 4, 0), Format);
Data = Builder.CreateBitCast(Data, Builder.getInt8PtrTy());
Builder.CreateCall(getVPrintF(Builder), {Format, Data});
}
Function *RuntimeDebugBuilder::getPrintF(PollyIRBuilder &Builder) {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
const char *Name = "printf";

View File

@ -314,91 +314,3 @@ if (POLLY_BUNDLED_ISL)
target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS})
target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS})
endif (POLLY_BUNDLED_ISL)
# External: Polyhedral Parallel Code Generator
#
# Builds the bundled ppcg sources as the PollyPPCG library. The whole section
# is skipped unless GPU_CODEGEN is true.
if (GPU_CODEGEN)
  set(PET_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pet")
  set(PPCG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ppcg")
  set(PPCG_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/ppcg")

  # Determine version of ppcg
  if (EXISTS "${PPCG_SOURCE_DIR}/GIT_HEAD_ID")
    # The source comes from a 'make dist' archive
    file(READ "${PPCG_SOURCE_DIR}/GIT_HEAD_ID" PPCG_GIT_HEAD_ID)
    string(STRIP "${PPCG_GIT_HEAD_ID}" PPCG_GIT_HEAD_ID)
  elseif (EXISTS "${PPCG_SOURCE_DIR}/gitversion.h")
    # The source directory is preconfigured
    file(READ "${PPCG_SOURCE_DIR}/gitversion.h" GITVERSION_H)
    string(REGEX REPLACE ".*\\\"([^\\\"]*)\\\".*" "\\1" PPCG_GIT_HEAD_ID "${GITVERSION_H}")
  else ()
    # Unknown revision
    # This used to be an argument-less `elseif ()`, whose empty condition is
    # never true, so the fallback below was unreachable and the status message
    # printed an empty version.
    # TODO: We could look for a .git and get the revision from HEAD
    set(PPCG_GIT_HEAD_ID "UNKNOWN")
  endif ()

  message(STATUS "PPCG version: ${PPCG_GIT_HEAD_ID}")

  # Every source file is listed exactly once (gpu_array_tile.c used to appear
  # twice).
  set (PPCG_FILES
    ppcg/cuda.c
    ppcg/cuda_common.c
    ppcg/external.c
    ppcg/gpu.c
    ppcg/gpu_array_tile.c
    ppcg/gpu_group.c
    ppcg/gpu_hybrid.c
    ppcg/gpu_print.c
    ppcg/gpu_tree.c
    ppcg/grouping.c
    ppcg/hybrid.c
    ppcg/ppcg.c
    ppcg/ppcg_options.c
    ppcg/print.c
    ppcg/schedule.c
    ppcg/util.c
  )

  # Kept directory-scoped and BEFORE so the search order seen by the sources
  # in this directory does not change.
  include_directories(BEFORE
    ${PPCG_BINARY_DIR}
    ${PPCG_SOURCE_DIR}/imath
    ${PPCG_SOURCE_DIR}/include
    ${PET_SOURCE_DIR}/include
  )

  add_polly_library(PollyPPCG
    ${PPCG_FILES}
  )

  target_link_libraries(PollyPPCG PUBLIC ${ISL_TARGET})

  # Disable warnings for upstream projects.
  if (MSVC)
    set(DISABLE_WARNING_FLAGS
      -wd4018 # 'expression' : signed/unsigned mismatch
      -wd4090 # 'operation' : different 'modifier' qualifiers
      -wd4200 # nonstandard extension used: zero-sized array in struct/union
      -wd4201 # nonstandard extension used: nameless struct/union
      -wd4334 # 'operator': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
      -wd4221 # nonstandard extension used : 'identifier' : cannot be initialized using address of automatic variable
    )
    if (POLLY_BUNDLED_ISL)
      target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS})
      target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS})
    endif ()
  else ()
    if (POLLY_BUNDLED_ISL)
      set_target_properties(PollyISL polly-isl-test PROPERTIES COMPILE_FLAGS "-w")
    endif ()
    set_target_properties(PollyPPCG PROPERTIES COMPILE_FLAGS "-w")
  endif ()

  if (MSVC)
    # In the Windows API (with some exceptions), the maximum length for a path is
    # MAX_PATH, which is defined as 260 characters.
    target_compile_definitions(PollyPPCG PRIVATE PATH_MAX=260)
  endif ()

  # Applied once for every compiler. With MSVC this uses the list set above;
  # otherwise it uses whatever DISABLE_WARNING_FLAGS already holds at this
  # point (possibly nothing).
  target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS})
endif ()

View File

@ -1,622 +0,0 @@
#ifndef PET_H
#define PET_H
#include <isl/aff.h>
#include <isl/arg.h>
#include <isl/ast_build.h>
#include <isl/set.h>
#include <isl/map.h>
#include <isl/union_map.h>
#include <isl/printer.h>
#include <isl/id_to_ast_expr.h>
#include <isl/id_to_pw_aff.h>
#include <isl/schedule.h>
#if defined(__cplusplus)
extern "C" {
#endif
struct pet_options;
ISL_ARG_DECL(pet_options, struct pet_options, pet_options_args)
/* Create an isl_ctx that references the pet options. */
isl_ctx *isl_ctx_alloc_with_pet_options();
/* If autodetect is set, any valid scop is extracted.
* Otherwise, the scop needs to be delimited by pragmas.
*/
int pet_options_set_autodetect(isl_ctx *ctx, int val);
int pet_options_get_autodetect(isl_ctx *ctx);
int pet_options_set_detect_conditional_assignment(isl_ctx *ctx, int val);
int pet_options_get_detect_conditional_assignment(isl_ctx *ctx);
/* If encapsulate-dynamic-control is set, then any dynamic control
* in the input program will be encapsulated in macro statements.
* This means in particular that no statements with arguments
* will be created.
*/
int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val);
int pet_options_get_encapsulate_dynamic_control(isl_ctx *ctx);
#define PET_OVERFLOW_AVOID 0
#define PET_OVERFLOW_IGNORE 1
int pet_options_set_signed_overflow(isl_ctx *ctx, int val);
int pet_options_get_signed_overflow(isl_ctx *ctx);
struct pet_loc;
typedef struct pet_loc pet_loc;
/* Return an additional reference to "loc". */
__isl_give pet_loc *pet_loc_copy(__isl_keep pet_loc *loc);
/* Free a reference to "loc". */
pet_loc *pet_loc_free(__isl_take pet_loc *loc);
/* Return the offset in the input file of the start of "loc". */
unsigned pet_loc_get_start(__isl_keep pet_loc *loc);
/* Return the offset in the input file of the character after "loc". */
unsigned pet_loc_get_end(__isl_keep pet_loc *loc);
/* Return the line number of a line within the "loc" region. */
int pet_loc_get_line(__isl_keep pet_loc *loc);
/* Return the indentation of the "loc" region. */
__isl_keep const char *pet_loc_get_indent(__isl_keep pet_loc *loc);
enum pet_expr_type {
pet_expr_error = -1,
pet_expr_access,
pet_expr_call,
pet_expr_cast,
pet_expr_int,
pet_expr_double,
pet_expr_op
};
enum pet_op_type {
/* only compound assignment operators before pet_op_assign */
pet_op_add_assign,
pet_op_sub_assign,
pet_op_mul_assign,
pet_op_div_assign,
pet_op_and_assign,
pet_op_xor_assign,
pet_op_or_assign,
pet_op_assign,
pet_op_add,
pet_op_sub,
pet_op_mul,
pet_op_div,
pet_op_mod,
pet_op_shl,
pet_op_shr,
pet_op_eq,
pet_op_ne,
pet_op_le,
pet_op_ge,
pet_op_lt,
pet_op_gt,
pet_op_minus,
pet_op_post_inc,
pet_op_post_dec,
pet_op_pre_inc,
pet_op_pre_dec,
pet_op_address_of,
pet_op_assume,
pet_op_kill,
pet_op_and,
pet_op_xor,
pet_op_or,
pet_op_not,
pet_op_land,
pet_op_lor,
pet_op_lnot,
pet_op_cond,
pet_op_last
};
/* Index into the pet_expr->args array when pet_expr->type == pet_expr_unary
*/
enum pet_un_arg_type {
pet_un_arg
};
/* Indices into the pet_expr->args array when
* pet_expr->type == pet_expr_binary
*/
enum pet_bin_arg_type {
pet_bin_lhs,
pet_bin_rhs
};
/* Indices into the pet_expr->args array when
* pet_expr->type == pet_expr_ternary
*/
enum pet_ter_arg_type {
pet_ter_cond,
pet_ter_true,
pet_ter_false
};
struct pet_expr;
typedef struct pet_expr pet_expr;
/* Return an additional reference to "expr". */
__isl_give pet_expr *pet_expr_copy(__isl_keep pet_expr *expr);
/* Free a reference to "expr". */
__isl_null pet_expr *pet_expr_free(__isl_take pet_expr *expr);
/* Return the isl_ctx in which "expr" was created. */
isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr);
/* Return the type of "expr". */
enum pet_expr_type pet_expr_get_type(__isl_keep pet_expr *expr);
/* Return the number of arguments of "expr". */
int pet_expr_get_n_arg(__isl_keep pet_expr *expr);
/* Set the number of arguments of "expr" to "n". */
__isl_give pet_expr *pet_expr_set_n_arg(__isl_take pet_expr *expr, int n);
/* Return the argument of "expr" at position "pos". */
__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos);
/* Replace the argument of "expr" at position "pos" by "arg". */
__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos,
__isl_take pet_expr *arg);
/* Return the operation type of operation expression "expr". */
enum pet_op_type pet_expr_op_get_type(__isl_keep pet_expr *expr);
/* Replace the operation type of operation expression "expr" by "type". */
__isl_give pet_expr *pet_expr_op_set_type(__isl_take pet_expr *expr,
enum pet_op_type type);
/* Construct a (read) access pet_expr from an index expression. */
__isl_give pet_expr *pet_expr_from_index(__isl_take isl_multi_pw_aff *index);
/* Does "expr" represent an affine expression? */
isl_bool pet_expr_is_affine(__isl_keep pet_expr *expr);
/* Does the access expression "expr" read the accessed elements? */
isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr);
/* Does the access expression "expr" write to the accessed elements? */
isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr);
/* Does the access expression "expr" kill the accessed elements? */
isl_bool pet_expr_access_is_kill(__isl_keep pet_expr *expr);
/* Mark "expr" as a read depending on "read". */
__isl_give pet_expr *pet_expr_access_set_read(__isl_take pet_expr *expr,
int read);
/* Mark "expr" as a write depending on "write". */
__isl_give pet_expr *pet_expr_access_set_write(__isl_take pet_expr *expr,
int write);
/* Mark "expr" as a kill depending on "kill". */
__isl_give pet_expr *pet_expr_access_set_kill(__isl_take pet_expr *expr,
int kill);
/* Return the reference identifier of access expression "expr". */
__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr);
/* Replace the reference identifier of access expression "expr" by "ref_id". */
__isl_give pet_expr *pet_expr_access_set_ref_id(__isl_take pet_expr *expr,
__isl_take isl_id *ref_id);
/* Return the identifier of the outer array accessed by "expr". */
__isl_give isl_id *pet_expr_access_get_id(__isl_keep pet_expr *expr);
/* Return the index expression of access expression "expr". */
__isl_give isl_multi_pw_aff *pet_expr_access_get_index(
__isl_keep pet_expr *expr);
/* Return the potential read access relation of access expression "expr". */
__isl_give isl_union_map *pet_expr_access_get_may_read(
__isl_keep pet_expr *expr);
/* Return the potential write access relation of access expression "expr". */
__isl_give isl_union_map *pet_expr_access_get_may_write(
__isl_keep pet_expr *expr);
/* Return the definite write access relation of access expression "expr". */
__isl_give isl_union_map *pet_expr_access_get_must_write(
__isl_keep pet_expr *expr);
/* Return the argument dependent potential read access relation of "expr". */
__isl_give isl_union_map *pet_expr_access_get_dependent_may_read(
__isl_keep pet_expr *expr);
/* Return the argument dependent potential write access relation of "expr". */
__isl_give isl_union_map *pet_expr_access_get_dependent_may_write(
__isl_keep pet_expr *expr);
/* Return the argument dependent definite write access relation of "expr". */
__isl_give isl_union_map *pet_expr_access_get_dependent_must_write(
__isl_keep pet_expr *expr);
/* Return the tagged potential read access relation of access "expr". */
__isl_give isl_union_map *pet_expr_access_get_tagged_may_read(
__isl_keep pet_expr *expr);
/* Return the tagged potential write access relation of access "expr". */
__isl_give isl_union_map *pet_expr_access_get_tagged_may_write(
__isl_keep pet_expr *expr);
/* Return the name of the function called by "expr". */
__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr);
/* Replace the name of the function called by "expr" by "name". */
__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr,
__isl_keep const char *name);
/* Create a pet_expr representing a cast of "arg" to "type_name". */
__isl_give pet_expr *pet_expr_new_cast(const char *type_name,
__isl_take pet_expr *arg);
/* Replace the type of the cast performed by "expr" by "name". */
__isl_give pet_expr *pet_expr_cast_set_type_name(__isl_take pet_expr *expr,
__isl_keep const char *name);
/* Return the value of the integer represented by "expr". */
__isl_give isl_val *pet_expr_int_get_val(__isl_keep pet_expr *expr);
/* Replace the value of the integer represented by "expr" by "v". */
__isl_give pet_expr *pet_expr_int_set_val(__isl_take pet_expr *expr,
__isl_take isl_val *v);
/* Return a string representation of the double expression "expr". */
__isl_give char *pet_expr_double_get_str(__isl_keep pet_expr *expr);
/* Replace value and string representation of the double expression "expr" */
__isl_give pet_expr *pet_expr_double_set(__isl_take pet_expr *expr,
double d, __isl_keep const char *s);
/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_access. */
int pet_expr_foreach_access_expr(__isl_keep pet_expr *expr,
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_call. */
int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr,
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
struct pet_context;
typedef struct pet_context pet_context;
/* Create a context with the given domain. */
__isl_give pet_context *pet_context_alloc(__isl_take isl_set *domain);
/* Return an additional reference to "pc". */
__isl_give pet_context *pet_context_copy(__isl_keep pet_context *pc);
/* Free a reference to "pc". */
__isl_null pet_context *pet_context_free(__isl_take pet_context *pc);
/* Return the isl_ctx in which "pc" was created. */
isl_ctx *pet_context_get_ctx(__isl_keep pet_context *pc);
/* Extract an affine expression defined over the domain of "pc" from "expr"
* or return NaN.
*/
__isl_give isl_pw_aff *pet_expr_extract_affine(__isl_keep pet_expr *expr,
__isl_keep pet_context *pc);
void pet_expr_dump(__isl_keep pet_expr *expr);
enum pet_tree_type {
pet_tree_error = -1,
pet_tree_expr,
pet_tree_block,
pet_tree_break,
pet_tree_continue,
pet_tree_decl, /* A declaration without initialization */
pet_tree_decl_init, /* A declaration with initialization */
pet_tree_if, /* An if without an else branch */
pet_tree_if_else, /* An if with an else branch */
pet_tree_for,
pet_tree_infinite_loop,
pet_tree_while,
pet_tree_return,
};
struct pet_tree;
typedef struct pet_tree pet_tree;
/* Return the isl_ctx in which "tree" was created. */
isl_ctx *pet_tree_get_ctx(__isl_keep pet_tree *tree);
/* Return an additional reference to "tree". */
__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree);
/* Free a reference to "tree". */
__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree);
/* Return the location of "tree". */
__isl_give pet_loc *pet_tree_get_loc(__isl_keep pet_tree *tree);
/* Return the type of "tree". */
enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree);
/* Return the expression of the expression tree "tree". */
__isl_give pet_expr *pet_tree_expr_get_expr(__isl_keep pet_tree *tree);
/* Return the expression returned by the return tree "tree". */
__isl_give pet_expr *pet_tree_return_get_expr(__isl_keep pet_tree *tree);
/* Return the number of children of the block tree "tree". */
int pet_tree_block_n_child(__isl_keep pet_tree *tree);
/* Return child "pos" of the block tree "tree". */
__isl_give pet_tree *pet_tree_block_get_child(__isl_keep pet_tree *tree,
int pos);
/* Is "tree" a declaration (with or without initialization)? */
int pet_tree_is_decl(__isl_keep pet_tree *tree);
/* Return the variable declared by the declaration tree "tree". */
__isl_give pet_expr *pet_tree_decl_get_var(__isl_keep pet_tree *tree);
/* Return the initial value of the pet_tree_decl_init tree "tree". */
__isl_give pet_expr *pet_tree_decl_get_init(__isl_keep pet_tree *tree);
/* Return the condition of the if tree "tree". */
__isl_give pet_expr *pet_tree_if_get_cond(__isl_keep pet_tree *tree);
/* Return the then branch of the if tree "tree". */
__isl_give pet_tree *pet_tree_if_get_then(__isl_keep pet_tree *tree);
/* Return the else branch of the if tree with else branch "tree". */
__isl_give pet_tree *pet_tree_if_get_else(__isl_keep pet_tree *tree);
/* Is "tree" a for loop, a while loop or an infinite loop? */
int pet_tree_is_loop(__isl_keep pet_tree *tree);
/* Return the induction variable of the for loop "tree" */
__isl_give pet_expr *pet_tree_loop_get_var(__isl_keep pet_tree *tree);
/* Return the initial value of the induction variable of the for loop "tree" */
__isl_give pet_expr *pet_tree_loop_get_init(__isl_keep pet_tree *tree);
/* Return the condition of the loop tree "tree" */
__isl_give pet_expr *pet_tree_loop_get_cond(__isl_keep pet_tree *tree);
/* Return the increment of the induction variable of the for loop "tree" */
__isl_give pet_expr *pet_tree_loop_get_inc(__isl_keep pet_tree *tree);
/* Return the body of the loop tree "tree" */
__isl_give pet_tree *pet_tree_loop_get_body(__isl_keep pet_tree *tree);
/* Call "fn" on each top-level expression in the nodes of "tree" */
int pet_tree_foreach_expr(__isl_keep pet_tree *tree,
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
/* Call "fn" on each access subexpression in the nodes of "tree" */
int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree,
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
/* Modify all call subexpressions in the nodes of "tree" through "fn". */
__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree,
__isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user),
void *user);
void pet_tree_dump(__isl_keep pet_tree *tree);
/* "loc" represents the region of the source code that is represented
* by this statement.
*
* If the statement has arguments, i.e., n_arg != 0, then
* "domain" is a wrapped map, mapping the iteration domain
* to the values of the arguments for which this statement
* is executed.
* Otherwise, it is simply the iteration domain.
*
* If one of the arguments is an access expression that accesses
* more than one element for a given iteration, then the constraints
* on the value of this argument (encoded in "domain") should be satisfied
* for all of those accessed elements.
*/
struct pet_stmt {
pet_loc *loc;
isl_set *domain;
pet_tree *body;
unsigned n_arg;
pet_expr **args;
};
/* Return the iteration space of "stmt". */
__isl_give isl_space *pet_stmt_get_space(struct pet_stmt *stmt);
/* Is "stmt" an assignment statement? */
int pet_stmt_is_assign(struct pet_stmt *stmt);
/* Is "stmt" a kill statement? */
int pet_stmt_is_kill(struct pet_stmt *stmt);
/* pet_stmt_build_ast_exprs is currently limited to only handle
* some forms of data dependent accesses.
* If pet_stmt_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs
* can safely be called on "stmt".
*/
int pet_stmt_can_build_ast_exprs(struct pet_stmt *stmt);
/* Construct an associative array from reference identifiers of
* access expressions in "stmt" to the corresponding isl_ast_expr.
* Each index expression is first transformed through "fn_index"
* (if not NULL). Then an AST expression is generated using "build".
* Finally, the AST expression is transformed using "fn_expr"
* (if not NULL).
*/
__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt,
__isl_keep isl_ast_build *build,
__isl_give isl_multi_pw_aff *(*fn_index)(
__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id,
void *user), void *user_index,
__isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr,
__isl_keep isl_id *id, void *user), void *user_expr);
/* Print "stmt" to "p".
*
* The access expressions in "stmt" are replaced by the isl_ast_expr
* associated to its reference identifier in "ref2expr".
*/
__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt,
__isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr);
/* This structure represents a defined type.
* "name" is the name of the type, while "definition" is a string
* representation of its definition.
*/
struct pet_type {
char *name;
char *definition;
};
/* context holds constraints on the parameter that ensure that
* this array has a valid (i.e., non-negative) size
*
* extent holds constraints on the indices
*
* value_bounds holds constraints on the elements of the array
* and may be NULL if no such constraints were specified by the user
*
* element_size is the size in bytes of each array element
* element_type is the type of the array elements.
* element_is_record is set if this type is a record type.
*
* live_out is set if the array appears in a live-out pragma
*
* if uniquely_defined is set then the array is written by a single access
* such that any element that is ever read
* is known to be assigned exactly once before the read
*
* declared is set if the array was declared somewhere inside the scop.
* exposed is set if the declared array is visible outside the scop.
* outer is set if the type of the array elements is a record and
* the fields of this record are represented by separate pet_array structures.
*/
struct pet_array {
isl_set *context;
isl_set *extent;
isl_set *value_bounds;
char *element_type;
int element_is_record;
int element_size;
int live_out;
int uniquely_defined;
int declared;
int exposed;
int outer;
};
/* This structure represents an implication on a boolean filter.
* In particular, if the filter value of an element in the domain
* of "extension" is equal to "satisfied", then the filter values
* of the corresponding images in "extension" are also equal
* to "satisfied".
*/
struct pet_implication {
int satisfied;
isl_map *extension;
};
/* This structure represents an independence implied by a for loop
* that is marked as independent in the source code.
* "filter" contains pairs of statement instances that are guaranteed
* not to be dependent on each other based on the independent for loop,
* assuming that no dependences carried by this loop are implied
* by the variables in "local".
* "local" contains the variables that are local to the loop that was
* marked independent.
*/
struct pet_independence {
isl_union_map *filter;
isl_union_set *local;
};
/* "loc" represents the region of the source code that is represented
* by this scop.
* If the scop was detected based on scop and endscop pragmas, then
* the lines containing these pragmas are included in this region.
* In the final result, the context describes the set of parameter values
* for which the scop can be executed.
* During the construction of the pet_scop, the context lives in a set space
* where each dimension refers to an outer loop.
* context_value describes assignments to the parameters (if any)
* outside of the scop.
*
* "schedule" is the schedule of the statements in the scop.
*
* The n_type types define types that may be referenced by the arrays.
*
* The n_implication implications describe implications on boolean filters.
*
* The n_independence independences describe independences implied
* by for loops that are marked independent in the source code.
*/
struct pet_scop {
pet_loc *loc;
isl_set *context;
isl_set *context_value;
isl_schedule *schedule;
int n_type;
struct pet_type **types;
int n_array;
struct pet_array **arrays;
int n_stmt;
struct pet_stmt **stmts;
int n_implication;
struct pet_implication **implications;
int n_independence;
struct pet_independence **independences;
};
typedef struct pet_scop pet_scop;
/* Return a textual representation of the operator. */
const char *pet_op_str(enum pet_op_type op);
int pet_op_is_inc_dec(enum pet_op_type op);
/* Extract a pet_scop from a C source file.
* If function is not NULL, then the pet_scop is extracted from
* a function with that name.
*/
__isl_give pet_scop *pet_scop_extract_from_C_source(isl_ctx *ctx,
const char *filename, const char *function);
/* Transform the C source file "input" by rewriting each scop.
* When autodetecting scops, at most one scop per function is rewritten.
* The transformed C code is written to "output".
*/
int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
__isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
__isl_take pet_scop *scop, void *user), void *user);
/* Given a scop and a printer passed to a pet_transform_C_source callback,
* print the original corresponding code to the printer.
*/
__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
__isl_take isl_printer *p);
/* Update all isl_sets and isl_maps such that they all have the same
* parameters in the same order.
*/
__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop);
/* Does "scop" contain any data dependent accesses? */
int pet_scop_has_data_dependent_accesses(__isl_keep pet_scop *scop);
/* Does "scop" contain any data dependent conditions? */
int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop);
/* pet_stmt_build_ast_exprs is currently limited to only handle
* some forms of data dependent accesses.
* If pet_scop_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs
* can safely be called on all statements in the scop.
*/
int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop);
void pet_scop_dump(__isl_keep pet_scop *scop);
__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop);
/* Return the context of "scop". */
__isl_give isl_set *pet_scop_get_context(__isl_keep pet_scop *scop);
/* Return the schedule of "scop". */
__isl_give isl_schedule *pet_scop_get_schedule(__isl_keep pet_scop *scop);
/* Return the set of all statement instances. */
__isl_give isl_union_set *pet_scop_get_instance_set(__isl_keep pet_scop *scop);
/* Return the potential read access relation. */
__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop);
/* Return the tagged potential read access relation. */
__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
__isl_keep pet_scop *scop);
/* Return the potential write access relation. */
__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop);
/* Return the definite write access relation. */
__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop);
/* Return the tagged potential write access relation. */
__isl_give isl_union_map *pet_scop_get_tagged_may_writes(
__isl_keep pet_scop *scop);
/* Return the tagged definite write access relation. */
__isl_give isl_union_map *pet_scop_get_tagged_must_writes(
__isl_keep pet_scop *scop);
/* Return the definite kill access relation. */
__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop);
/* Return the tagged definite kill access relation. */
__isl_give isl_union_map *pet_scop_get_tagged_must_kills(
__isl_keep pet_scop *scop);
/* Compute a mapping from all outermost arrays (of structs) in scop
* to their innermost members.
*/
__isl_give isl_union_map *pet_scop_compute_outer_to_inner(
__isl_keep pet_scop *scop);
/* Compute a mapping from all outermost arrays (of structs) in scop
* to their members, including the outermost arrays themselves.
*/
__isl_give isl_union_map *pet_scop_compute_outer_to_any(
__isl_keep pet_scop *scop);
#if defined(__cplusplus)
}
#endif
#endif

View File

@ -1,29 +0,0 @@
version: 0.07
date: Tue Feb 7 17:23:22 CET 2017
changes:
- support hybrid tiling
---
version: 0.06
date: Fri May 6 12:08:50 CEST 2016
changes:
- use PPCG specific macro names in generated code
- complete transition to schedule trees
- maximize coincidence by default
- map arrays with constant index expressions to private memory
- optionally group chains of statements
---
version: 0.05
date: Fri Jan 15 09:30:23 CET 2016
changes:
- fix live-out computation
- optionally compute schedule for C target
- optionally perform tiling for C target
- create single kernel for non-permutable subtree
---
version: 0.04
date: Wed Jun 17 10:52:58 CEST 2015
changes:
- use schedule trees
- fix live-range reordering
- improve generation of synchronization
- exploit independences during dependence analysis

View File

@ -1 +0,0 @@
ppcg-0.07

View File

@ -1,246 +0,0 @@
Requirements:
- automake, autoconf, libtool
(not needed when compiling a release)
- pkg-config (http://www.freedesktop.org/wiki/Software/pkg-config)
(not needed when compiling a release using the included isl and pet)
- gmp (http://gmplib.org/)
- libyaml (http://pyyaml.org/wiki/LibYAML)
(only needed if you want to compile the pet executable)
- LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html)
Unless you have some other reasons for wanting to use the svn version,
it is best to install the latest release (3.9).
For more details, see pet/README.
If you are installing on Ubuntu, then you can install the following packages:
automake autoconf libtool pkg-config libgmp3-dev libyaml-dev libclang-dev llvm
Note that you need at least version 3.2 of libclang-dev (ubuntu raring).
Older versions of this package did not include the required libraries.
If you are using an older version of ubuntu, then you need to compile and
install LLVM/clang from source.
Preparing:
Grab the latest release and extract it or get the source from
the git repository as follows. This process requires autoconf,
automake, libtool and pkg-config.
git clone git://repo.or.cz/ppcg.git
cd ppcg
./get_submodules.sh
./autogen.sh
Compilation:
./configure
make
make check
If you have installed any of the required libraries in a non-standard
location, then you may need to use the --with-gmp-prefix,
--with-libyaml-prefix and/or --with-clang-prefix options
when calling "./configure".
Using PPCG to generate CUDA or OpenCL code
To convert a fragment of a C program to CUDA, insert a line containing
#pragma scop
before the fragment and add a line containing
#pragma endscop
after the fragment. To generate CUDA code run
ppcg --target=cuda file.c
where file.c is the file containing the fragment. The generated
code is stored in file_host.cu and file_kernel.cu.
To generate OpenCL code run
ppcg --target=opencl file.c
where file.c is the file containing the fragment. The generated code
is stored in file_host.c and file_kernel.cl.
Specifying tile, grid and block sizes
The iteration space tile size, grid size and block size can
be specified using the --sizes option. The argument is a union map
in isl notation mapping kernels identified by their sequence number
in a "kernel" space to singleton sets in the "tile", "grid" and "block"
spaces. The sizes are specified outermost to innermost.
The dimension of the "tile" space indicates the (maximal) number of loop
dimensions to tile. The elements of the single integer tuple
specify the tile sizes in each dimension.
In case of hybrid tiling, the first element is half the size of
the tile in the time (sequential) dimension. The second element
specifies the number of elements in the base of the hexagon.
The remaining elements specify the tile sizes in the remaining space
dimensions.
The dimension of the "grid" space indicates the (maximal) number of block
dimensions in the grid. The elements of the single integer tuple
specify the number of blocks in each dimension.
The dimension of the "block" space indicates the (maximal) number of thread
dimensions in the block. The elements of the single integer tuple
specify the number of threads in each dimension.
For example,
{ kernel[0] -> tile[64,64]; kernel[i] -> block[16] : i != 4 }
specifies that in kernel 0, two loops should be tiled with a tile
size of 64 in both dimensions and that all kernels except kernel 4
should be run using a block of 16 threads.
Since PPCG performs some scheduling, it can be difficult to predict
what exactly will end up in a kernel. If you want to specify
tile, grid or block sizes, you may want to run PPCG first with the defaults,
examine the kernels and then run PPCG again with the desired sizes.
Instead of examining the kernels, you can also specify the option
--dump-sizes on the first run to obtain the effectively used default sizes.
Compiling the generated CUDA code with nvcc
To get optimal performance from nvcc, it is important to choose --arch
according to your target GPU. Specifically, use the flag "--arch sm_20"
for fermi, "--arch sm_30" for GK10x Kepler and "--arch sm_35" for
GK110 Kepler. We discourage the use of older cards as we have seen
correctness issues with compilation for older architectures.
Note that in the absence of any --arch flag, nvcc defaults to
"--arch sm_13". This will not only be slower, but can also cause
correctness issues.
If you want to obtain results that are identical to those obtained
by the original code, then you may need to disable some optimizations
by passing the "--fmad=false" option.
Compiling the generated OpenCL code with gcc
To compile the host code you need to link against the file
ocl_utilities.c which contains utility functions used by the generated
OpenCL host code. To compile the host code with gcc, run
gcc -std=c99 file_host.c ocl_utilities.c -lOpenCL
Note that we have experienced the generated OpenCL code freezing
on some inputs (e.g., the PolyBench symm benchmark) when using
at least some version of the Nvidia OpenCL library, while the
corresponding CUDA code runs fine.
We have experienced no such freezes when using AMD, ARM or Intel
OpenCL libraries.
By default, the compiled executable will need the _kernel.cl file at
run time. Alternatively, the option --opencl-embed-kernel-code may be
given to place the kernel code in a string literal. The kernel code is
then compiled into the host binary, such that the _kernel.cl file is no
longer needed at run time. Any kernel include files, in particular
those supplied using --opencl-include-file, will still be required at
run time.
Function calls
Function calls inside the analyzed fragment are reproduced
in the CUDA or OpenCL code, but for now it is left to the user
to make sure that the functions that are being called are
available from the generated kernels.
In the case of OpenCL code, the --opencl-include-file option
may be used to specify one or more files to be #include'd
from the generated code. These files may then contain
the definitions of the functions being called from the
program fragment. If the pathnames of the included files
are relative to the current directory, then you may need
to additionally specify the --opencl-compiler-options=-I.
to make sure that the files can be found by the OpenCL compiler.
The included files may contain definitions of types used by the
generated kernels. By default, PPCG generates definitions for
types as needed, but these definitions may collide with those in
the included files, as PPCG does not consider the contents of the
included files. The --no-opencl-print-kernel-types will prevent
PPCG from generating type definitions.
GNU extensions
By default, PPCG may print out macro definitions that involve
GNU extensions such as __typeof__ and statement expressions.
Some compilers may not support these extensions.
In particular, OpenCL 1.2 beignet 1.1.1 (git-6de6918)
has been reported not to support __typeof__.
The use of these extensions can be turned off with the
--no-allow-gnu-extensions option.
Processing PolyBench
When processing a PolyBench/C 3.2 benchmark, you should always specify
-DPOLYBENCH_USE_C99_PROTO on the ppcg command line. Otherwise, the source
files are inconsistent, having fixed size arrays but parametrically
bounded loops iterating over them.
However, you should not specify this define when compiling
the PPCG generated code using nvcc since CUDA does not support VLAs.
CUDA and function overloading
While CUDA supports function overloading based on the arguments types,
no such function overloading exists in the input language C. Since PPCG
simply prints out the same function name as in the original code, this
may result in a different function being called based on the types
of the arguments. For example, if the original code contains a call
to the function sqrt() with a float argument, then the argument will
be promoted to a double and the sqrt() function will be called.
In the transformed (CUDA) code, however, overloading will cause the
function sqrtf() to be called. Until this issue has been resolved in PPCG,
we recommend that users either explicitly call the function sqrtf() or
explicitly cast the argument to double in the input code.
Contact
For bug reports, feature requests and questions,
contact http://groups.google.com/group/isl-development
Whenever you report a bug, please mention the exact version of PPCG
that you are using (output of "./ppcg --version"). If you are unable
to compile PPCG, then report the git version (output of "git describe")
or the version number included in the name of the tarball.
Citing PPCG
If you use PPCG for your research, you are invited to cite
the following paper.
@article{Verdoolaege2013PPCG,
author = {Verdoolaege, Sven and Juega, Juan Carlos and Cohen, Albert and
G\'{o}mez, Jos{\'e} Ignacio and Tenllado, Christian and
Catthoor, Francky},
title = {Polyhedral parallel code generation for CUDA},
journal = {ACM Trans. Archit. Code Optim.},
issue_date = {January 2013},
volume = {9},
number = {4},
month = jan,
year = {2013},
issn = {1544-3566},
pages = {54:1--54:23},
doi = {10.1145/2400682.2400713},
acmid = {2400713},
publisher = {ACM},
address = {New York, NY, USA},
}

View File

@ -1,802 +0,0 @@
/*
* Copyright 2012 INRIA Paris-Rocquencourt
* Copyright 2012 Ecole Normale Superieure
*
* Use of this software is governed by the MIT license
*
* Written by Tobias Grosser, INRIA Paris-Rocquencourt,
* Domaine de Voluceau, Rocquencourt, B.P. 105,
* 78153 Le Chesnay Cedex France
* and Sven Verdoolaege,
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
*/
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <isl/aff.h>
#include <isl/ctx.h>
#include <isl/flow.h>
#include <isl/map.h>
#include <isl/ast_build.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include <pet.h>
#include "ppcg.h"
#include "ppcg_options.h"
#include "cpu.h"
#include "print.h"
#include "schedule.h"
#include "util.h"
/* A statement as it appears inside a generated AST.
 *
 * "stmt" points to the original pet statement.
 * "ref2expr" associates the reference identifier of each access in
 * the statement with the AST expression that is printed
 * in place of that access.
 */
struct ppcg_stmt {
	struct pet_stmt *stmt;

	isl_id_to_ast_expr *ref2expr;
};

/* Release the ppcg_stmt passed as the untyped pointer "user",
 * together with its "ref2expr" map.
 * The pet statement it points to is not freed here.
 * A NULL pointer is silently ignored.
 */
static void ppcg_stmt_free(void *user)
{
	struct ppcg_stmt *ppcg_stmt = user;

	if (ppcg_stmt) {
		isl_id_to_ast_expr_free(ppcg_stmt->ref2expr);
		free(ppcg_stmt);
	}
}
/* Open the file to which the transformed code is written.
 *
 * If "output" is non-NULL, it is used as the file name as is.
 * Otherwise, the name is derived from "input", the entire path of
 * the input file: the base name produced by ppcg_extract_base_name
 * is followed by ".ppcg" and by the extension of the input file,
 * or by ".c" if the input file name has no extension.
 * This means file.c becomes file.ppcg.c.
 *
 * The extension is looked up in the last path component only, such that
 * a '.' in a directory name (as in "dir.d/file") is not mistaken
 * for the start of an extension.
 *
 * Return NULL, after printing a message to stderr, if the derived name
 * does not fit in PATH_MAX bytes or if the file cannot be opened.
 */
static FILE *get_output_file(const char *input, const char *output)
{
	char name[PATH_MAX];
	const char ppcg_marker[] = ".ppcg";
	FILE *file;

	if (!output) {
		const char *base, *ext;
		int len, n;

		/* ppcg_extract_base_name is not told the size of "name".
		 * NOTE(review): this assumes the base name it writes is
		 * never longer than "input" -- confirm against util.c.
		 */
		if (strlen(input) >= sizeof(name)) {
			fprintf(stderr, "Input file name '%s' is too long\n",
				input);
			return NULL;
		}
		len = ppcg_extract_base_name(name, input);
		if (len < 0 || (size_t) len >= sizeof(name)) {
			fprintf(stderr, "Input file name '%s' is too long\n",
				input);
			return NULL;
		}

		/* Only consider the part after the last '/'. */
		base = strrchr(input, '/');
		base = base ? base + 1 : input;
		ext = strrchr(base, '.');

		n = snprintf(name + len, sizeof(name) - len, "%s%s",
				ppcg_marker, ext ? ext : ".c");
		if (n < 0 || (size_t) n >= sizeof(name) - len) {
			fprintf(stderr,
				"Output file name derived from '%s' "
				"is too long\n", input);
			return NULL;
		}
		output = name;
	}

	file = fopen(output, "w");
	if (!file) {
		fprintf(stderr, "Unable to open '%s' for writing\n", output);
		return NULL;
	}
	return file;
}
/* Data used to annotate for nodes in the ast.
*
* An object of this type is attached as the "user" pointer of the isl_id
* that ast_build_before_for creates for each for node; it is read back
* by ast_build_after_for and print_for.
*/
struct ast_node_userinfo {
/* The for node is an openmp parallel for node. */
int is_openmp;
};
/* Information used while building the ast.
*
* A single object of this type is shared by the before/after for-node
* callbacks registered in print_scop when the openmp option is set.
*/
struct ast_build_userinfo {
/* The current ppcg scop. */
struct ppcg_scop *scop;
/* Are we currently in a parallel for loop?
* Set by mark_openmp_parallel, reset by ast_build_after_for.
*/
int in_parallel_for;
};
/* Check if the current scheduling dimension is parallel.
 *
 * We check for parallelism by verifying that the loop does not carry any
 * dependences.
 * If the live_range_reordering option is set, then this currently
 * includes the order dependences.  In principle, non-zero order dependences
 * could be allowed, but this would require privatization and/or expansion.
 *
 * Parallelism test: if the distance is zero in all outer dimensions, then it
 * has to be zero in the current dimension as well.
 * Implementation: first, translate dependences into time space, then force
 * outer dimensions to be equal.  If the distance is zero in the current
 * dimension, then the loop is parallel.
 * The distance is zero in the current dimension if it is a subset of a map
 * with equal values for the current dimension.
 *
 * Return 1 if the dimension is parallel and 0 otherwise.
 * If any of the isl tests fails, then the dimension is conservatively
 * reported as not parallel, such that an isl error can never result
 * in a loop being marked parallel.
 */
static int ast_schedule_dim_is_parallel(__isl_keep isl_ast_build *build,
	struct ppcg_scop *scop)
{
	isl_union_map *schedule, *deps;
	isl_map *schedule_deps, *test;
	isl_space *schedule_space;
	isl_bool empty, is_parallel;
	int i, dimension;

	schedule = isl_ast_build_get_schedule(build);
	schedule_space = isl_ast_build_get_schedule_space(build);

	/* Position of the innermost (current) schedule dimension. */
	dimension = isl_space_dim(schedule_space, isl_dim_out) - 1;
	isl_space_free(schedule_space);
	if (dimension < 0) {
		/* No current dimension (or an error): nothing to mark. */
		isl_union_map_free(schedule);
		return 0;
	}

	deps = isl_union_map_copy(scop->dep_flow);
	deps = isl_union_map_union(deps, isl_union_map_copy(scop->dep_false));
	if (scop->options->live_range_reordering) {
		isl_union_map *order = isl_union_map_copy(scop->dep_order);
		deps = isl_union_map_union(deps, order);
	}
	/* Express both ends of the dependences in schedule (time) space. */
	deps = isl_union_map_apply_range(deps, isl_union_map_copy(schedule));
	deps = isl_union_map_apply_domain(deps, schedule);

	empty = isl_union_map_is_empty(deps);
	if (empty != isl_bool_false) {
		/* Either no dependences at all (parallel) or an error. */
		isl_union_map_free(deps);
		return empty == isl_bool_true;
	}

	schedule_deps = isl_map_from_union_map(deps);

	for (i = 0; i < dimension; i++)
		schedule_deps = isl_map_equate(schedule_deps, isl_dim_out, i,
						isl_dim_in, i);

	test = isl_map_universe(isl_map_get_space(schedule_deps));
	test = isl_map_equate(test, isl_dim_out, dimension, isl_dim_in,
				dimension);
	is_parallel = isl_map_is_subset(schedule_deps, test);

	isl_map_free(test);
	isl_map_free(schedule_deps);

	return is_parallel == isl_bool_true;
}
/* Flag the for node described by "node_info" as an openmp parallel loop
 * if no enclosing for node has been flagged yet and the current schedule
 * dimension is parallel.  In that case, also record in "build_info"
 * that we are now inside a parallel for loop.
 */
static void mark_openmp_parallel(__isl_keep isl_ast_build *build,
	struct ast_build_userinfo *build_info,
	struct ast_node_userinfo *node_info)
{
	if (!build_info->in_parallel_for &&
	    ast_schedule_dim_is_parallel(build, build_info->scop)) {
		node_info->is_openmp = 1;
		build_info->in_parallel_for = 1;
	}
}
/* Allocate an ast_node_userinfo structure and initialize it
 * with default values (not an openmp parallel loop).
 * Return NULL if the allocation fails.
 */
static struct ast_node_userinfo *allocate_ast_node_userinfo(void)
{
	struct ast_node_userinfo *node_info;

	node_info = malloc(sizeof(*node_info));
	if (!node_info)
		return NULL;
	node_info->is_openmp = 0;
	return node_info;
}

/* Free an ast_node_userinfo structure.
 * The signature matches what isl_id_set_free_user expects.
 */
static void free_ast_node_userinfo(void *ptr)
{
	struct ast_node_userinfo *info;

	info = (struct ast_node_userinfo *) ptr;
	free(info);
}

/* This method is executed before the construction of a for node.  It creates
 * an isl_id that is used to annotate the subsequently generated ast for nodes.
 * The id owns a freshly allocated ast_node_userinfo, which is released
 * through free_ast_node_userinfo when the id is freed.
 *
 * In this function we also run the following analyses:
 *
 * - Detection of openmp parallel loops
 *
 * Return NULL if the node information or the id cannot be allocated.
 * NOTE(review): isl is expected to treat a NULL id returned by this
 * callback as an error -- confirm against the isl documentation.
 */
static __isl_give isl_id *ast_build_before_for(
	__isl_keep isl_ast_build *build, void *user)
{
	isl_id *id;
	struct ast_build_userinfo *build_info;
	struct ast_node_userinfo *node_info;

	build_info = (struct ast_build_userinfo *) user;
	node_info = allocate_ast_node_userinfo();
	if (!node_info)
		return NULL;

	id = isl_id_alloc(isl_ast_build_get_ctx(build), "", node_info);
	if (!id) {
		/* No id took ownership of node_info, so release it here. */
		free_ast_node_userinfo(node_info);
		return NULL;
	}
	id = isl_id_set_free_user(id, free_ast_node_userinfo);

	mark_openmp_parallel(build, build_info, node_info);

	return id;
}
/* Callback invoked once a for node has been constructed.
 *
 * If the annotation of "node" says that it is an openmp parallel loop,
 * then we are leaving the parallel region, so the 'in_parallel_for' flag
 * of the build information passed in "user" is cleared again.
 * "node" itself is returned unchanged.
 */
static __isl_give isl_ast_node *ast_build_after_for(
	__isl_take isl_ast_node *node, __isl_keep isl_ast_build *build,
	void *user)
{
	struct ast_build_userinfo *build_info = user;
	struct ast_node_userinfo *node_info;
	isl_id *annotation;

	annotation = isl_ast_node_get_annotation(node);
	node_info = isl_id_get_user(annotation);

	if (node_info && node_info->is_openmp)
		build_info->in_parallel_for = 0;

	isl_id_free(annotation);

	return node;
}
/* Look up the statement in scop->pet->stmts whose domain
 * has "id" as tuple identifier.
 * Identifiers are compared by pointer.
 * If there is no such statement, an internal isl error is raised
 * and NULL is returned.
 */
static struct pet_stmt *find_stmt(struct ppcg_scop *scop, __isl_keep isl_id *id)
{
	int pos;

	for (pos = 0; pos < scop->pet->n_stmt; ++pos) {
		struct pet_stmt *candidate = scop->pet->stmts[pos];
		isl_id *tuple_id = isl_set_get_tuple_id(candidate->domain);
		int found = tuple_id == id;

		isl_id_free(tuple_id);
		if (found)
			return candidate;
	}

	isl_die(isl_id_get_ctx(id), isl_error_internal,
		"statement not found", return NULL);
}
/* Callback for printing a user node of the generated AST.
 * The ppcg_stmt attached to the node by at_each_domain supplies
 * the original statement and the AST expressions to print
 * in place of its accesses.
 * "print_options" is taken over by this function and released.
 */
static __isl_give isl_printer *print_user(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *annotation = isl_ast_node_get_annotation(node);
	struct ppcg_stmt *ppcg_stmt = isl_id_get_user(annotation);

	isl_id_free(annotation);
	isl_ast_print_options_free(print_options);

	return pet_stmt_print_body(ppcg_stmt->stmt, p, ppcg_stmt->ref2expr);
}
/* Print the for node "node" as an openmp parallel loop, i.e.,
 * as a regular for loop preceded by a line containing
 * "#pragma omp parallel for".
 *
 * No explicit data-sharing clauses are needed.  Variables declared
 * inside the loop body are automatically 'private' and iterators
 * declared outside of the loop are automatically 'shared'.
 * Since ppcg declares every iterator at the position where it is
 * assigned, these defaults are already correct.
 *
 * The result is only valid OpenMP code if the ast was generated
 * with the 'atomic-bounds' option enabled.
 */
static __isl_give isl_printer *print_for_with_openmp(
	__isl_keep isl_ast_node *node, __isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options)
{
	p = isl_printer_end_line(
		isl_printer_print_str(isl_printer_start_line(p),
					"#pragma omp parallel for"));

	return isl_ast_node_for_print(node, p, print_options);
}
/* Callback for printing a for node.
 *
 * If the node carries an annotation whose user data marks it
 * as an openmp parallel loop, then it is printed with an openmp pragma.
 * Otherwise, it is printed as a plain for loop.
 */
static __isl_give isl_printer *print_for(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	struct ast_node_userinfo *info = NULL;
	isl_id *annotation;

	annotation = isl_ast_node_get_annotation(node);
	if (annotation)
		info = (struct ast_node_userinfo *)
			isl_id_get_user(annotation);

	if (info && info->is_openmp)
		p = print_for_with_openmp(node, p, print_options);
	else
		p = isl_ast_node_for_print(node, p, print_options);

	isl_id_free(annotation);

	return p;
}
/* Callback passed to pet_stmt_build_ast_exprs for rewriting
 * index expressions.
 *
 * "index" describes the array indices as a function of the statement
 * iterators.  "user" is an isl_pw_multi_aff (the iterator map) describing
 * the statement iterators as a function of the AST loop iterators.
 * "id" is not used.
 *
 * The result describes the array indices as a function of
 * the AST loop iterators.  The iterator map itself is only copied,
 * so it remains owned by the caller.
 */
static __isl_give isl_multi_pw_aff *pullback_index(
	__isl_take isl_multi_pw_aff *index, __isl_keep isl_id *id, void *user)
{
	isl_pw_multi_aff *iterator_map = user;

	return isl_multi_pw_aff_pullback_pw_multi_aff(index,
				isl_pw_multi_aff_copy(iterator_map));
}
/* Transform the accesses in the statement associated to the domain
 * called by "node" to refer to the AST loop iterators, construct
 * corresponding AST expressions using "build",
 * collect them in a ppcg_stmt and annotate the node with the ppcg_stmt.
 *
 * "user" is the ppcg_scop in which the statement is looked up.
 *
 * The ppcg_stmt is owned by the annotation id, which releases it
 * through ppcg_stmt_free.  If anything goes wrong before that id
 * has been created, including a failure to create the id itself,
 * then the ppcg_stmt and "node" are freed and NULL is returned.
 */
static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
	__isl_keep isl_ast_build *build, void *user)
{
	struct ppcg_scop *scop = user;
	isl_ast_expr *expr, *arg;
	isl_ctx *ctx;
	isl_id *id;
	isl_map *map;
	isl_pw_multi_aff *iterator_map;
	struct ppcg_stmt *stmt;

	ctx = isl_ast_node_get_ctx(node);
	stmt = isl_calloc_type(ctx, struct ppcg_stmt);
	if (!stmt)
		goto error;

	/* The first argument of the call expression names the statement. */
	expr = isl_ast_node_user_get_expr(node);
	arg = isl_ast_expr_get_op_arg(expr, 0);
	isl_ast_expr_free(expr);
	id = isl_ast_expr_get_id(arg);
	isl_ast_expr_free(arg);
	stmt->stmt = find_stmt(scop, id);
	isl_id_free(id);
	if (!stmt->stmt)
		goto error;

	/* Map from AST loop iterators to statement iterators. */
	map = isl_map_from_union_map(isl_ast_build_get_schedule(build));
	map = isl_map_reverse(map);
	iterator_map = isl_pw_multi_aff_from_map(map);
	stmt->ref2expr = pet_stmt_build_ast_exprs(stmt->stmt, build,
				&pullback_index, iterator_map, NULL, NULL);
	isl_pw_multi_aff_free(iterator_map);

	id = isl_id_alloc(ctx, NULL, stmt);
	if (!id)
		goto error;	/* nobody owns "stmt" yet */
	id = isl_id_set_free_user(id, &ppcg_stmt_free);
	return isl_ast_node_set_annotation(node, id);
error:
	ppcg_stmt_free(stmt);
	return isl_ast_node_free(node);
}
/* Schedule tree traversal callback that keeps track, in the int
 * pointed to by "user" (initialized to 0 by the caller), of the largest
 * schedule depth among the leaf nodes visited.
 *
 * For a non-leaf node, isl_bool_true is returned such that
 * the traversal continues with its descendants.
 */
static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user)
{
	int *max_depth = user;

	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf) {
		int leaf_depth = isl_schedule_node_get_schedule_depth(node);

		if (*max_depth < leaf_depth)
			*max_depth = leaf_depth;
		return isl_bool_false;
	}

	return isl_bool_true;
}
/* Callback invoked on every node of a CPU AST.
 * "user" points to the printer to which macro definitions are printed.
 *
 * For a user node, print the macro definitions required by the AST
 * expressions stored in its ppcg_stmt annotation, i.e., those needed
 * for the substitutions of the original user statement, and do not
 * descend any further.  A user node without such an annotation
 * is treated as an error.
 * For any other node, return true such that its descendants
 * are visited as well.
 */
static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
{
	isl_printer **printer = user;
	struct ppcg_stmt *ppcg_stmt;
	isl_id *annotation;

	if (isl_ast_node_get_type(node) != isl_ast_node_user)
		return isl_bool_true;

	annotation = isl_ast_node_get_annotation(node);
	ppcg_stmt = isl_id_get_user(annotation);
	isl_id_free(annotation);
	if (!ppcg_stmt)
		return isl_bool_error;

	*printer = ppcg_print_body_macros(*printer, ppcg_stmt->ref2expr);

	return *printer ? isl_bool_false : isl_bool_error;
}
/* Print to "p" all macro definitions needed by the CPU AST "node":
 * first those required by the user statements inside the AST
 * and then those required by the AST itself.
 * If the traversal of the AST fails, "p" is freed.
 */
static __isl_give isl_printer *cpu_print_macros(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node)
{
	if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) >= 0)
		return ppcg_print_macros(p, node);

	return isl_printer_free(p);
}
/* Generate an AST for "scop" from "schedule" and print
 * the corresponding C code to "p".
 *
 * The AST iterators are called "c0", "c1", ..., with as many names
 * as the maximal schedule depth of the leaves of "schedule".
 * If the openmp option is set, then callbacks are installed around
 * the construction of each for node to detect parallel loops.
 *
 * On failure to traverse the schedule, both "schedule" and "p"
 * are freed and NULL is returned.
 */
static __isl_give isl_printer *print_scop(struct ppcg_scop *scop,
	__isl_take isl_schedule *schedule, __isl_take isl_printer *p,
	struct ppcg_options *options)
{
	isl_ctx *ctx = isl_printer_get_ctx(p);
	struct ast_build_userinfo build_info;
	isl_ast_print_options *print_options;
	isl_id_list *iterators;
	isl_ast_build *build;
	isl_ast_node *tree;
	int depth = 0;

	if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
						&depth) < 0) {
		isl_schedule_free(schedule);
		isl_printer_free(p);
		return NULL;
	}

	/* Set up the AST build. */
	build = isl_ast_build_alloc(ctx);
	iterators = ppcg_scop_generate_names(scop, depth, "c");
	build = isl_ast_build_set_iterators(build, iterators);
	build = isl_ast_build_set_at_each_domain(build, &at_each_domain, scop);

	if (options->openmp) {
		/* "build_info" must outlive the AST construction below. */
		build_info.scop = scop;
		build_info.in_parallel_for = 0;

		build = isl_ast_build_set_before_each_for(build,
							&ast_build_before_for,
							&build_info);
		build = isl_ast_build_set_after_each_for(build,
							&ast_build_after_for,
							&build_info);
	}

	tree = isl_ast_build_node_from_schedule(build, schedule);
	isl_ast_build_free(build);

	/* Print the macros followed by the AST itself. */
	print_options = isl_ast_print_options_alloc(ctx);
	print_options = isl_ast_print_options_set_print_user(print_options,
							&print_user, NULL);
	print_options = isl_ast_print_options_set_print_for(print_options,
							&print_for, NULL);

	p = cpu_print_macros(p, tree);
	p = isl_ast_node_print(tree, p, print_options);

	isl_ast_node_free(tree);

	return p;
}
/* Apply tiling with tile sizes "sizes" to the band node "node" and
 * set the AST loop type of all members of the resulting tile node
 * to "atomic".
 */
static __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node,
	__isl_take isl_multi_val *sizes)
{
	return ppcg_set_schedule_node_type(
			isl_schedule_node_band_tile(node, sizes),
			isl_ast_loop_atomic);
}
/* Schedule tree callback that tiles "node" if it is a band node
 * with at least 2 members and leaves any other node untouched.
 * "user" is the ppcg_scop; every tile size is taken
 * from its "tile_size" option.
 */
static __isl_give isl_schedule_node *tile_band(
	__isl_take isl_schedule_node *node, void *user)
{
	struct ppcg_scop *scop = user;
	isl_multi_val *sizes;
	int n_member;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
		return node;

	n_member = isl_schedule_node_band_n_member(node);
	if (n_member <= 1)
		return node;

	sizes = ppcg_multi_val_from_int(isl_schedule_node_band_get_space(node),
					scop->options->tile_size);

	return tile(node, sizes);
}
/* Derive the schedule constraints used for computing a CPU schedule
 * from the dependences in "ps".
 *
 * The proximity constraints are always the flow dependences.
 *
 * With live-range reordering, the conditional validity constraints
 * are the order dependences, conditioned on the flow dependences.
 * That is, a live-range (flow dependence) is either local to
 * an iteration of a band or all adjacent order dependences
 * are respected by the band.
 * The validity constraints are then the union of the flow dependences
 * and the forced dependences, while the coincidence constraints
 * additionally include the order dependences.
 *
 * Without live-range reordering, both the validity and
 * the coincidence constraints are the union of the flow
 * dependences and the false dependences.
 *
 * The coincidence constraints are only set when the "openmp"
 * option is set.  The openmp pragmas are not introduced based on
 * the coincident property of the schedule band members, but
 * the coincidence constraints do affect the way the schedule is constructed,
 * such that more schedule dimensions should be detected as parallel
 * by ast_schedule_dim_is_parallel.
 * Since ast_schedule_dim_is_parallel also takes the order dependences
 * into account, they are part of the coincidence constraints as well.
 * If the openmp handling learns how to privatize some memory,
 * then the corresponding order dependences can be removed
 * from the coincidence constraints.
 */
static __isl_give isl_schedule_constraints *construct_cpu_schedule_constraints(
	struct ppcg_scop *ps)
{
	int reorder = ps->options->live_range_reordering;
	isl_schedule_constraints *sc;
	isl_union_map *validity;

	sc = isl_schedule_constraints_on_domain(isl_union_set_copy(ps->domain));

	if (reorder)
		sc = isl_schedule_constraints_set_conditional_validity(sc,
				isl_union_map_copy(ps->tagged_dep_flow),
				isl_union_map_copy(ps->tagged_dep_order));

	/* flow + forced (with reordering) or flow + false (without) */
	validity = isl_union_map_copy(ps->dep_flow);
	validity = isl_union_map_union(validity,
			isl_union_map_copy(reorder ? ps->dep_forced :
						     ps->dep_false));

	if (ps->options->openmp) {
		isl_union_map *coincidence;

		coincidence = isl_union_map_copy(validity);
		if (reorder)
			coincidence = isl_union_map_union(coincidence,
					isl_union_map_copy(ps->dep_order));
		sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
	}

	sc = isl_schedule_constraints_set_validity(sc, validity);
	sc = isl_schedule_constraints_set_proximity(sc,
					isl_union_map_copy(ps->dep_flow));

	return sc;
}
/* Compute a CPU schedule for the scop "ps".
 *
 * The schedule constraints are derived from the dependences in "ps"
 * (and dumped if the dump_schedule_constraints debug option is set).
 * They are then handed to ppcg_compute_schedule together with
 * the input schedule, which may be used to group statement instances.
 *
 * Return NULL if "ps" is NULL.
 */
static __isl_give isl_schedule *compute_cpu_schedule(struct ppcg_scop *ps)
{
	isl_schedule_constraints *constraints;

	if (!ps)
		return NULL;

	constraints = construct_cpu_schedule_constraints(ps);

	if (ps->options->debug->dump_schedule_constraints)
		isl_schedule_constraints_dump(constraints);

	return ppcg_compute_schedule(constraints, ps->schedule, ps->options);
}
/* Return the schedule to use for the scop passed in "user":
 * a newly computed schedule if the reschedule option is set and
 * a copy of the original schedule otherwise.
 * Return NULL if no scop is passed.
 */
static __isl_give isl_schedule *optionally_compute_schedule(void *user)
{
	struct ppcg_scop *ps = user;

	if (!ps)
		return NULL;

	if (ps->options->reschedule)
		return compute_cpu_schedule(ps);

	return isl_schedule_copy(ps->schedule);
}
/* Obtain a schedule for "ps" through ppcg_get_schedule, which is given
 * optionally_compute_schedule as the way to compute one from
 * the dependences in "ps", and tile all bands with more than one member
 * if the tile option of "ps" is set.
 * Return NULL if "ps" is NULL.
 */
static __isl_give isl_schedule *get_schedule(struct ppcg_scop *ps,
	struct ppcg_options *options)
{
	isl_schedule *schedule;

	if (!ps)
		return NULL;

	schedule = ppcg_get_schedule(isl_union_set_get_ctx(ps->domain),
				options, &optionally_compute_schedule, ps);

	if (!ps->options->tile)
		return schedule;

	return isl_schedule_map_schedule_node_bottom_up(schedule,
							&tile_band, ps);
}
/* Print C code for the scop "ps", generated according to "schedule",
 * to "p".
 *
 * The output starts with a marker comment and an empty line, followed by
 * the exposed variable declarations.  If there are any hidden
 * declarations, then they are printed inside a block that also
 * encloses the generated code.
 * The context of "ps" is inserted into the schedule before
 * code generation and the resulting schedule is dumped
 * if the dump_final_schedule debug option is set.
 */
static __isl_give isl_printer *print_cpu_with_schedule(
	__isl_take isl_printer *p, struct ppcg_scop *ps,
	__isl_take isl_schedule *schedule, struct ppcg_options *options)
{
	int has_hidden;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, "/* ppcg generated CPU code */");
	p = isl_printer_end_line(p);

	/* empty separator line */
	p = isl_printer_end_line(isl_printer_start_line(p));

	p = ppcg_set_macro_names(p);
	p = ppcg_print_exposed_declarations(p, ps);

	has_hidden = ppcg_scop_any_hidden_declarations(ps);
	if (has_hidden) {
		p = ppcg_start_block(p);
		p = ppcg_print_hidden_declarations(p, ps);
	}

	schedule = isl_schedule_insert_context(schedule,
			isl_set_from_params(isl_set_copy(ps->context)));
	if (options->debug->dump_final_schedule)
		isl_schedule_dump(schedule);

	p = print_scop(ps, schedule, p, options);

	return has_hidden ? ppcg_end_block(p) : p;
}
/* Print C code for the scop "ps", including variable declarations,
 * to "p", using (a copy of) the original schedule of "ps".
 */
__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
	struct ppcg_scop *ps, struct ppcg_options *options)
{
	return print_cpu_with_schedule(p, ps, isl_schedule_copy(ps->schedule),
					options);
}
/* Print CPU code for "scop" to "p".
 *
 * A schedule for "scop" is obtained first, as determined by "options",
 * and the code is then printed according to that schedule.
 */
static __isl_give isl_printer *generate(__isl_take isl_printer *p,
	struct ppcg_scop *scop, struct ppcg_options *options)
{
	return print_cpu_with_schedule(p, scop, get_schedule(scop, options),
					options);
}
/* Adapter that lets generate be used as a ppcg_transform callback:
 * the untyped "user" pointer holds the ppcg_options.
 */
static __isl_give isl_printer *print_cpu_wrap(__isl_take isl_printer *p,
	struct ppcg_scop *scop, void *user)
{
	return generate(p, scop, (struct ppcg_options *) user);
}
/* Transform the code in the file called "input" by replacing
 * all scops by corresponding CPU code and write the results to a file
 * called "output" (or to a file with a name derived from "input"
 * if "output" is NULL).
 *
 * Return -1 if the output file cannot be opened or if closing it fails,
 * since in the latter case buffered output may not have reached the file.
 * Otherwise, return the result of ppcg_transform.
 */
int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
	const char *input, const char *output)
{
	FILE *output_file;
	int r;

	output_file = get_output_file(input, output);
	if (!output_file)
		return -1;

	r = ppcg_transform(ctx, input, output_file, options,
				&print_cpu_wrap, options);

	/* fclose flushes pending output; do not ignore a failure. */
	if (fclose(output_file) != 0) {
		fprintf(stderr, "Unable to write the generated code\n");
		r = -1;
	}

	return r;
}

View File

@ -1,15 +0,0 @@
#ifndef _CPU_H
#define _CPU_H
#include <isl/ctx.h>
#include "ppcg.h"
/* Only pointers to the options are used below, so a forward declaration
 * is sufficient.
 */
struct ppcg_options;
/* Print CPU code for the scop "ps" to "p" and return the printer. */
__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
struct ppcg_scop *ps, struct ppcg_options *options);
/* Transform the file called "input" into CPU code written to the file
 * called "output"; the int result is a status code.
 */
int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
const char *input, const char *output);
#endif

View File

@ -1,730 +0,0 @@
/*
* Copyright 2012 Ecole Normale Superieure
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege,
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
*/
#include <isl/aff.h>
#include <isl/ast.h>
#include "cuda_common.h"
#include "cuda.h"
#include "gpu.h"
#include "gpu_print.h"
#include "print.h"
#include "util.h"
/* Print to "p" the definitions of the cudaCheckReturn and cudaCheckKernel
 * macros that the printed host code uses for error checking.
 */
static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
{
	static const char check_macros[] =
		"#define cudaCheckReturn(ret) \\\n"
		" do { \\\n"
		" cudaError_t cudaCheckReturn_e = (ret); \\\n"
		" if (cudaCheckReturn_e != cudaSuccess) { \\\n"
		" fprintf(stderr, \"CUDA error: %s\\n\", "
		"cudaGetErrorString(cudaCheckReturn_e)); \\\n"
		" fflush(stderr); \\\n"
		" } \\\n"
		" assert(cudaCheckReturn_e == cudaSuccess); \\\n"
		" } while(0)\n"
		"#define cudaCheckKernel() \\\n"
		" do { \\\n"
		" cudaCheckReturn(cudaGetLastError()); \\\n"
		" } while(0)\n\n";

	return isl_printer_print_str(p, check_macros);
}
/* Print to "p" the declaration of the device pointer "dev_<name>"
 * belonging to "array".
 *
 * An array with more than one index that is not linearized is declared
 * as a pointer to an array; the sizes printed between brackets are
 * the operands 1 + i (for i >= 1) of array->bound_expr.
 */
static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
	struct gpu_array_info *array)
{
	int pos;
	int multi_dim = !array->linearize && array->n_index > 1;

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, array->type);
	p = isl_printer_print_str(p, " ");
	if (multi_dim)
		p = isl_printer_print_str(p, "(");
	p = isl_printer_print_str(p, "*dev_");
	p = isl_printer_print_str(p, array->name);
	if (multi_dim)
		p = isl_printer_print_str(p, ")");

	for (pos = 1; multi_dim && pos < array->n_index; ++pos) {
		isl_ast_expr *size;

		size = isl_ast_expr_get_op_arg(array->bound_expr, 1 + pos);
		p = isl_printer_print_str(p, "[");
		p = isl_printer_print_ast_expr(p, size);
		p = isl_printer_print_str(p, "]");
		isl_ast_expr_free(size);
	}

	p = isl_printer_print_str(p, ";");
	return isl_printer_end_line(p);
}
/* Print to "p" a device pointer declaration for every array of "prog"
 * that requires device allocation, followed by an empty line.
 */
static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	int pos;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *info = &prog->array[pos];

		if (gpu_array_requires_device_allocation(info))
			p = declare_device_array(p, info);
	}

	p = isl_printer_start_line(p);
	return isl_printer_end_line(p);
}
/* Print to "p" a cudaMalloc call (wrapped in cudaCheckReturn) for every
 * array of "prog" that requires device allocation.  Each call is preceded
 * by the macro definitions needed by the bound expression of the array.
 * An empty line is printed at the end.
 */
static __isl_give isl_printer *allocate_device_arrays(
	__isl_take isl_printer *p, struct gpu_prog *prog)
{
	int pos;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *info = &prog->array[pos];

		if (!gpu_array_requires_device_allocation(info))
			continue;

		p = ppcg_ast_expr_print_macros(info->bound_expr, p);
		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p,
			"cudaCheckReturn(cudaMalloc((void **) &dev_");
		p = isl_printer_print_str(p, info->name);
		p = isl_printer_print_str(p, ", ");
		p = gpu_array_info_print_size(p, info);
		p = isl_printer_print_str(p, "));");
		p = isl_printer_end_line(p);
	}

	p = isl_printer_start_line(p);
	return isl_printer_end_line(p);
}
/* Print to "p" a cudaFree call (wrapped in cudaCheckReturn) for every
 * array of "prog" that requires device allocation.
 */
static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	int pos;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *info = &prog->array[pos];

		if (gpu_array_requires_device_allocation(info)) {
			p = isl_printer_start_line(p);
			p = isl_printer_print_str(p,
					"cudaCheckReturn(cudaFree(dev_");
			p = isl_printer_print_str(p, info->name);
			p = isl_printer_print_str(p, "));");
			p = isl_printer_end_line(p);
		}
	}

	return p;
}
/* Print to "p" a cudaMemcpy call that copies all of "array" from the host
 * to its device copy "dev_<name>".  The host operand of a scalar is
 * prefixed with "&".  The size argument is printed by
 * gpu_array_info_print_size.
 */
static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
	struct gpu_array_info *array)
{
	const char *host_name = array->name;

	p = isl_printer_print_str(isl_printer_start_line(p),
				"cudaCheckReturn(cudaMemcpy(dev_");
	p = isl_printer_print_str(p, host_name);
	p = isl_printer_print_str(p, ", ");

	if (gpu_array_is_scalar(array))
		p = isl_printer_print_str(p, "&");
	p = isl_printer_print_str(p, host_name);
	p = isl_printer_print_str(p, ", ");

	p = gpu_array_info_print_size(p, array);
	p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");

	return isl_printer_end_line(p);
}
/* Print to "p" a cudaMemcpy call that copies all of "array" from its
 * device copy "dev_<name>" back to the host.  The host operand of a scalar
 * is prefixed with "&".  The size argument is printed by
 * gpu_array_info_print_size.
 */
static __isl_give isl_printer *copy_array_from_device(
	__isl_take isl_printer *p, struct gpu_array_info *array)
{
	const char *host_name = array->name;

	p = isl_printer_print_str(isl_printer_start_line(p),
				"cudaCheckReturn(cudaMemcpy(");
	if (gpu_array_is_scalar(array))
		p = isl_printer_print_str(p, "&");
	p = isl_printer_print_str(p, host_name);

	p = isl_printer_print_str(p, ", dev_");
	p = isl_printer_print_str(p, host_name);
	p = isl_printer_print_str(p, ", ");

	p = gpu_array_info_print_size(p, array);
	p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");

	return isl_printer_end_line(p);
}
/* Print the "len" integers of "list" to "p" in reverse order,
 * separated by ", " and enclosed in parentheses.
 * Nothing is printed if "len" is zero.
 */
static __isl_give isl_printer* print_reverse_list(__isl_take isl_printer *p, int len, int *list)
{
	int pos;

	if (len == 0)
		return p;

	p = isl_printer_print_str(p, "(");
	for (pos = len - 1; pos >= 0; --pos) {
		/* Every element but the first printed one gets a separator. */
		if (pos < len - 1)
			p = isl_printer_print_str(p, ", ");
		p = isl_printer_print_int(p, list[pos]);
	}

	return isl_printer_print_str(p, ")");
}
/* Print the grid size of "kernel" to "p" as a parenthesized,
 * comma-separated list, running from the last set dimension of
 * kernel->grid_size down to the first.
 * The individual sizes are the operands 1 + i of kernel->grid_size_expr.
 * Nothing is printed if there are no dimensions.
 */
static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
	struct ppcg_kernel *kernel)
{
	int pos;
	int n_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);

	if (n_dim == 0)
		return p;

	p = isl_printer_print_str(p, "(");
	for (pos = n_dim; pos-- > 0; ) {
		isl_ast_expr *size;

		size = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + pos);
		p = isl_printer_print_ast_expr(p, size);
		isl_ast_expr_free(size);
		if (pos != 0)
			p = isl_printer_print_str(p, ", ");
	}

	return isl_printer_print_str(p, ")");
}
/* Print to "p" the line "dim3 k<id>_dimGrid<sizes>;" that defines
 * the grid of "kernel".
 */
static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
	struct ppcg_kernel *kernel)
{
	p = isl_printer_print_str(isl_printer_start_line(p), "dim3 k");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "_dimGrid");
	p = print_grid_size(p, kernel);
	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}
/* Print the argument list of "kernel" to "p", either as it appears in
 * a declaration ("types" set, so the argument types are included) or
 * as it appears in a call ("types" not set).
 *
 * Three groups of arguments are printed, in this order:
 * - the arrays of "prog" that the kernel requires as an argument,
 * - the parameters of the space of kernel->arrays (declared as "int"),
 * - the set dimensions of kernel->space, i.e., the host loop iterators
 *   (declared with the AST iterator type configured in the isl context).
 *
 * The printer is freed and NULL is returned if it cannot be determined
 * whether an array is required.
 */
static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
	struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
{
	int pos, n_iter;
	int need_sep = 0;
	unsigned n_param;
	isl_space *param_space;
	const char *iter_type;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *info = &prog->array[pos];
		int required;

		required = ppcg_kernel_requires_array_argument(kernel, pos);
		if (required < 0)
			return isl_printer_free(p);
		if (!required)
			continue;

		if (need_sep)
			p = isl_printer_print_str(p, ", ");
		need_sep = 1;
		p = types ?
		    gpu_array_info_print_declaration_argument(p, info, NULL) :
		    gpu_array_info_print_call_argument(p, info);
	}

	param_space = isl_union_set_get_space(kernel->arrays);
	n_param = isl_space_dim(param_space, isl_dim_param);
	for (pos = 0; pos < n_param; ++pos) {
		const char *param_name;

		param_name = isl_space_get_dim_name(param_space,
						isl_dim_param, pos);
		if (need_sep)
			p = isl_printer_print_str(p, ", ");
		need_sep = 1;
		if (types)
			p = isl_printer_print_str(p, "int ");
		p = isl_printer_print_str(p, param_name);
	}
	isl_space_free(param_space);

	n_iter = isl_space_dim(kernel->space, isl_dim_set);
	iter_type = isl_options_get_ast_iterator_type(prog->ctx);
	for (pos = 0; pos < n_iter; ++pos) {
		const char *iter_name;

		if (need_sep)
			p = isl_printer_print_str(p, ", ");
		need_sep = 1;
		iter_name = isl_space_get_dim_name(kernel->space,
						isl_dim_set, pos);
		if (types) {
			p = isl_printer_print_str(p, iter_type);
			p = isl_printer_print_str(p, " ");
		}
		p = isl_printer_print_str(p, iter_name);
	}

	return p;
}
/* Print "__global__ void kernel<id>(<typed arguments>)" to "p".
 * The line is started but not ended, so that the caller can decide
 * what follows the closing parenthesis.
 */
static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
	struct gpu_prog *prog, struct ppcg_kernel *kernel)
{
	p = isl_printer_print_str(isl_printer_start_line(p),
				"__global__ void kernel");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "(");
	p = print_kernel_arguments(p, prog, kernel, 1);

	return isl_printer_print_str(p, ")");
}
/* Print the header of "kernel" to both cuda->kernel_h and cuda->kernel_c.
 * In kernel_h, the header is terminated by ";" so that it forms
 * a declaration; in kernel_c, only the line is ended.
 */
static void print_kernel_headers(struct gpu_prog *prog,
	struct ppcg_kernel *kernel, struct cuda_info *cuda)
{
	FILE *targets[2];
	int k;

	targets[0] = cuda->kernel_h;
	targets[1] = cuda->kernel_c;

	for (k = 0; k < 2; ++k) {
		isl_printer *p;

		p = isl_printer_to_file(prog->ctx, targets[k]);
		p = isl_printer_set_output_format(p, ISL_FORMAT_C);
		p = print_kernel_header(p, prog, kernel);
		if (k == 0)
			p = isl_printer_print_str(p, ";");
		p = isl_printer_end_line(p);
		isl_printer_free(p);
	}
}
/* Write padding spaces to "dst" by printing an empty string
 * in a field of width "indent".
 */
static void print_indent(FILE *dst, int indent)
{
fprintf(dst, "%*s", indent, "");
}
/* Print to "out" a single declaration, indented by 4, of iterators of
 * type "type" named after the elements of "ids".
 * Each iterator is initialized to one of the identifiers in "cuda_dims",
 * taken in reverse order, so that the last iterator is initialized to
 * cuda_dims[0] (the x identifier).
 * Nothing is printed if "ids" has no elements.
 */
static void print_iterators(FILE *out, const char *type,
	__isl_keep isl_id_list *ids, const char *cuda_dims[])
{
	int pos;
	int count = isl_id_list_n_id(ids);
	const char *sep = "";

	if (count <= 0)
		return;

	print_indent(out, 4);
	fprintf(out, "%s ", type);
	for (pos = 0; pos < count; ++pos) {
		isl_id *id = isl_id_list_get_id(ids, pos);

		fprintf(out, "%s%s = %s", sep, isl_id_get_name(id),
			cuda_dims[count - 1 - pos]);
		isl_id_free(id);
		sep = ", ";
	}
	fprintf(out, ";\n");
}
/* Print to "out" the declarations of the block and thread iterators
 * of "kernel", initialized to blockIdx and threadIdx components.
 * The iterator type is the AST iterator type configured in the isl
 * context of kernel->tree.
 */
static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
{
	const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
	const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
					"threadIdx.z" };
	const char *iter_type;

	iter_type = isl_options_get_ast_iterator_type(
				isl_ast_node_get_ctx(kernel->tree));

	print_iterators(out, iter_type, kernel->block_ids, block_dims);
	print_iterators(out, iter_type, kernel->thread_ids, thread_dims);
}
/* Print to "p" the declaration of the kernel-local array "var",
 * prefixed by "__shared__ " if it is of type ppcg_access_shared.
 * One bracketed size, taken from var->size, is printed for each index
 * of the underlying array.
 */
static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
	struct ppcg_kernel_var *var)
{
	int dim;

	p = isl_printer_start_line(p);
	if (var->type == ppcg_access_shared)
		p = isl_printer_print_str(p, "__shared__ ");
	p = isl_printer_print_str(p, var->array->type);
	p = isl_printer_print_str(p, " ");
	p = isl_printer_print_str(p, var->name);

	for (dim = 0; dim < var->array->n_index; ++dim) {
		isl_val *extent = isl_vec_get_element_val(var->size, dim);

		p = isl_printer_print_str(p, "[");
		p = isl_printer_print_val(p, extent);
		isl_val_free(extent);
		p = isl_printer_print_str(p, "]");
	}

	p = isl_printer_print_str(p, ";");
	return isl_printer_end_line(p);
}
/* Print to "p" the declarations of all kernel->n_var local variables
 * of "kernel".
 */
static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
	struct ppcg_kernel *kernel)
{
	int idx = 0;

	while (idx < kernel->n_var) {
		p = print_kernel_var(p, &kernel->var[idx]);
		++idx;
	}

	return p;
}
/* Print a "__syncthreads();" line to "p".  "stmt" is not inspected.
 */
static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	p = isl_printer_print_str(isl_printer_start_line(p),
				"__syncthreads();");
	return isl_printer_end_line(p);
}
/* Callback for printing a user statement inside a kernel, i.e.,
 * a kernel body statement, a copy statement or a sync statement.
 *
 * The ppcg_kernel_stmt is recovered from the annotation of "node" and
 * printing is dispatched on its type.  "print_options" is freed and
 * "user" is not used.
 */
static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	isl_id *annotation;
	struct ppcg_kernel_stmt *kernel_stmt;

	annotation = isl_ast_node_get_annotation(node);
	kernel_stmt = isl_id_get_user(annotation);
	isl_id_free(annotation);
	isl_ast_print_options_free(print_options);

	if (kernel_stmt->type == ppcg_kernel_copy)
		return ppcg_kernel_print_copy(p, kernel_stmt);
	if (kernel_stmt->type == ppcg_kernel_sync)
		return print_sync(p, kernel_stmt);
	if (kernel_stmt->type == ppcg_kernel_domain)
		return ppcg_kernel_print_domain(p, kernel_stmt);

	return p;
}
/* Print the complete definition of "kernel": its header goes to both
 * cuda->kernel_h and cuda->kernel_c, its body only to cuda->kernel_c.
 *
 * The body is written partly with plain fprintf calls and partly through
 * an isl printer on the same FILE, so the order of the statements below
 * determines the order of the output.
 */
static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
struct cuda_info *cuda)
{
isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
isl_ast_print_options *print_options;
isl_printer *p;
print_kernel_headers(prog, kernel, cuda);
/* Opening brace of the kernel body, followed by the iterator declarations. */
fprintf(cuda->kernel_c, "{\n");
print_kernel_iterators(cuda->kernel_c, kernel);
p = isl_printer_to_file(ctx, cuda->kernel_c);
p = isl_printer_set_output_format(p, ISL_FORMAT_C);
p = isl_printer_indent(p, 4);
p = print_kernel_vars(p, kernel);
p = isl_printer_end_line(p);
p = ppcg_set_macro_names(p);
p = gpu_print_macros(p, kernel->tree);
/* User statements in the kernel AST are printed by print_kernel_stmt. */
print_options = isl_ast_print_options_alloc(ctx);
print_options = isl_ast_print_options_set_print_user(print_options,
&print_kernel_stmt, NULL);
p = isl_ast_node_print(kernel->tree, p, print_options);
isl_printer_free(p);
/* Closing brace of the kernel body. */
fprintf(cuda->kernel_c, "}\n");
}
/* Print to "p" the code that prepares the device for running
 * the transformed code: the CUDA error checking macros, the local
 * declarations of "prog", and the declaration and allocation of
 * the device copies of the arrays.
 */
static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	p = gpu_print_local_declarations(print_cuda_macros(p), prog);
	p = declare_device_arrays(p, prog);

	return allocate_device_arrays(p, prog);
}
/* Print to "p" the code that cleans up the device once the transformed
 * code has run, i.e., the calls that free the device arrays.
 */
static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	return free_device_arrays(p, prog);
}
/* Print to "p" the host statement "node" that either copies an array
 * to or from the device, or initializes or clears the device.
 *
 * The kind of statement is derived from the name of the identifier that
 * forms the first operand of the expression of "node":
 * - "init_device" and "clear_device" print the device set-up and
 *   clean-up code,
 * - a name starting with "to_device" copies the array to the device,
 * - any other name copies the array back from the device.
 * For the copy statements, the user pointer of the identifier is
 * the gpu_array_info of the array to copy.
 *
 * The printer is freed and NULL is returned if the identifier has
 * no name or if a copy statement carries no array.
 */
static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node, struct gpu_prog *prog)
{
	isl_ast_expr *call, *callee;
	isl_id *stmt_id;
	const char *stmt_name;
	struct gpu_array_info *info;

	call = isl_ast_node_user_get_expr(node);
	callee = isl_ast_expr_get_op_arg(call, 0);
	stmt_id = isl_ast_expr_get_id(callee);
	stmt_name = isl_id_get_name(stmt_id);
	info = isl_id_get_user(stmt_id);
	isl_id_free(stmt_id);
	isl_ast_expr_free(callee);
	isl_ast_expr_free(call);

	if (stmt_name == NULL)
		return isl_printer_free(p);
	if (strcmp(stmt_name, "init_device") == 0)
		return init_device(p, prog);
	if (strcmp(stmt_name, "clear_device") == 0)
		return clear_device(p, prog);
	if (info == NULL)
		return isl_printer_free(p);

	return prefixcmp(stmt_name, "to_device") == 0 ?
	       copy_array_to_device(p, info) :
	       copy_array_from_device(p, info);
}
/* User data handed to print_host_user through the AST print options
 * (see print_host_code).
 */
struct print_host_user_data {
/* The CUDA output files. */
struct cuda_info *cuda;
/* The program being printed. */
struct gpu_prog *prog;
};
/* Print the user statement "node" of the host code to "p".
 * "user" points to a struct print_host_user_data.
 *
 * The host code may contain original user statements, kernel launches,
 * statements that copy data to/from the device and statements
 * that initialize or clear the device.
 * The original user statements and the kernel launches carry
 * an annotation, while the other statements do not.
 * The latter are handled by print_device_node.
 * The annotation on an original user statement is called "user".
 *
 * For a kernel launch, a block of statements is printed that
 * defines the block and the grid, launches the kernel and
 * checks the launch with cudaCheckKernel.
 */
__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user)
{
	struct print_host_user_data *data = user;
	struct ppcg_kernel *kernel;
	isl_id *annotation;

	isl_ast_print_options_free(print_options);

	annotation = isl_ast_node_get_annotation(node);
	if (annotation == NULL)
		return print_device_node(p, node, data->prog);

	if (strcmp(isl_id_get_name(annotation), "user") == 0) {
		struct ppcg_kernel_stmt *stmt = isl_id_get_user(annotation);

		isl_id_free(annotation);
		return ppcg_kernel_print_domain(p, stmt);
	}
	kernel = isl_id_get_user(annotation);
	isl_id_free(annotation);

	p = ppcg_start_block(p);

	/* dim3 k<id>_dimBlock(...); */
	p = isl_printer_print_str(isl_printer_start_line(p), "dim3 k");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "_dimBlock");
	p = print_reverse_list(p, kernel->n_block, kernel->block_dim);
	p = isl_printer_print_str(p, ";");
	p = isl_printer_end_line(p);

	/* dim3 k<id>_dimGrid(...); */
	p = print_grid(p, kernel);

	/* kernel<id> <<<k<id>_dimGrid, k<id>_dimBlock>>> (...); */
	p = isl_printer_print_str(isl_printer_start_line(p), "kernel");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, " <<<k");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "_dimGrid, k");
	p = isl_printer_print_int(p, kernel->id);
	p = isl_printer_print_str(p, "_dimBlock>>> (");
	p = print_kernel_arguments(p, data->prog, kernel, 0);
	p = isl_printer_print_str(p, ");");
	p = isl_printer_end_line(p);

	p = isl_printer_print_str(isl_printer_start_line(p),
				"cudaCheckKernel();");
	p = isl_printer_end_line(p);

	p = ppcg_end_block(p);

	/* Empty line after the launch block. */
	p = isl_printer_end_line(isl_printer_start_line(p));

#if 0
	print_kernel(data->prog, kernel, data->cuda);
#endif

	return p;
}
/* Print the host code for "prog", represented by the AST "tree", to "p".
 * The macro definitions needed by "tree" are printed first; user
 * statements in the tree are printed by print_host_user, which receives
 * "cuda" and "prog" as its user data.
 */
static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
	struct cuda_info *cuda)
{
	struct print_host_user_data data;
	isl_ast_print_options *options;

	data.cuda = cuda;
	data.prog = prog;

	options = isl_ast_print_options_alloc(isl_ast_node_get_ctx(tree));
	options = isl_ast_print_options_set_print_user(options,
						&print_host_user, &data);

	p = gpu_print_macros(p, tree);
	return isl_ast_node_print(tree, p, options);
}
/* Given a gpu_prog "prog" and the corresponding transformed AST
 * "tree", print the entire CUDA code to "p".
 * "types" collects the types for which a definition has already
 * been printed.
 * "user" points to the struct cuda_info holding the output files.
 *
 * The type definitions are first printed to the kernel source file
 * through a temporary printer.  If that fails, "p" is freed and
 * NULL is returned.  Otherwise, the host code is printed to "p".
 */
static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
	struct gpu_types *types, void *user)
{
	struct cuda_info *cuda = user;
	isl_printer *kernel;
	int failed;

	kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
	kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
	kernel = gpu_print_types(kernel, types, prog);

	/* Record the outcome before releasing the printer: the value of
	 * a pointer must not be inspected after the object it points to
	 * has been freed.
	 */
	failed = !kernel;
	isl_printer_free(kernel);
	if (failed)
		return isl_printer_free(p);

	return print_host_code(p, prog, tree, cuda);
}
/* Replace all scops in the file called "input" by corresponding CUDA code.
 * The names of the output files are derived from "input".
 *
 * The actual work is delegated to generate_gpu, which calls back
 * into print_cuda for printing the AST.
 * The output files are opened before generate_gpu is called and
 * closed again once it has finished.
 * The return value is that of generate_gpu.
 */
int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
	const char *input)
{
	int status;
	struct cuda_info files;

	cuda_open_files(&files, input);
	status = generate_gpu(ctx, input, files.host_c, options,
				&print_cuda, &files);
	cuda_close_files(&files);

	return status;
}

View File

@ -1,13 +0,0 @@
#ifndef _CUDA_H
#define _CUDA_H
#include "ppcg_options.h"
#include "ppcg.h"
/* Transform the scops in the file called "input" into CUDA code;
 * the int result is a status code.
 */
int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
const char *input);
/* Callback, in the form expected by
 * isl_ast_print_options_set_print_user, for printing a user statement
 * of the host code.
 */
__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
__isl_take isl_ast_print_options *print_options,
__isl_keep isl_ast_node *node, void *user);
#endif

View File

@ -1,50 +0,0 @@
/*
* Copyright 2010 INRIA Saclay
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
* Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
* 91893 Orsay, France
*/
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include "cuda_common.h"
#include "ppcg.h"
/* Open the host .cu file and the kernel .hu and .cu files for writing.
 * Their names are formed by appending "_host.cu", "_kernel.cu" and
 * "_kernel.hu" to the base name extracted from "input".
 * Add the necessary includes.
 *
 * If a file name would not fit in the buffer or a file cannot be opened,
 * a message is printed to stderr and the corresponding field(s) of "info"
 * are left NULL; nothing is written to a file that failed to open.
 */
void cuda_open_files(struct cuda_info *info, const char *input)
{
	char name[PATH_MAX];
	int len;

	info->host_c = NULL;
	info->kernel_c = NULL;
	info->kernel_h = NULL;

	len = ppcg_extract_base_name(name, input);
	/* The longest suffix appended below is as long as "_kernel.cu". */
	if (len < 0 || (size_t) len + sizeof("_kernel.cu") > sizeof(name)) {
		fprintf(stderr,
			"Output file name derived from \"%s\" is too long\n",
			input);
		return;
	}

	strcpy(name + len, "_host.cu");
	info->host_c = fopen(name, "w");
	if (!info->host_c)
		perror(name);

	strcpy(name + len, "_kernel.cu");
	info->kernel_c = fopen(name, "w");
	if (!info->kernel_c)
		perror(name);

	strcpy(name + len, "_kernel.hu");
	info->kernel_h = fopen(name, "w");
	if (!info->kernel_h)
		perror(name);

	/* At this point, "name" is the name of the kernel header,
	 * which is included by both source files.
	 */
	if (info->host_c) {
		fprintf(info->host_c, "#include <assert.h>\n");
		fprintf(info->host_c, "#include <stdio.h>\n");
		fprintf(info->host_c, "#include \"%s\"\n", name);
	}
	if (info->kernel_c)
		fprintf(info->kernel_c, "#include \"%s\"\n", name);
	if (info->kernel_h)
		fprintf(info->kernel_h, "#include \"cuda.h\"\n\n");
}
/* Close all output files.
 * A file pointer that is NULL (e.g., because the file could not be
 * opened) is skipped, since passing NULL to fclose is undefined.
 */
void cuda_close_files(struct cuda_info *info)
{
	if (info->kernel_c)
		fclose(info->kernel_c);
	if (info->kernel_h)
		fclose(info->kernel_h);
	if (info->host_c)
		fclose(info->host_c);
}

View File

@ -1,15 +0,0 @@
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_
#include <stdio.h>
struct cuda_info {
FILE *host_c;
FILE *kernel_c;
FILE *kernel_h;
};
void cuda_open_files(struct cuda_info *info, const char *input);
void cuda_close_files(struct cuda_info *info);
#endif

View File

@ -1,192 +0,0 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <pet.h>
#include "cpu.h"
#include "opencl.h"
/* Report on stderr that the enclosing dummy function was called and
 * abort the program.
 *
 * The body is wrapped in do { } while (0) so that "die();" behaves as
 * a single statement in every context, and the standard __func__ is
 * used for the name of the enclosing function.
 */
#define die() do { \
	fprintf(stderr, "Dummy function %s called\n", __func__); \
	abort(); \
} while (0)
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_compute_outer_to_any(
__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_compute_outer_to_inner(
__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts;
 * "fn" is never invoked.
 */
int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree,
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_expr_access_get_tagged_may_read(
__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_expr_access_get_tagged_may_write(
__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_expr_access_get_must_write(
__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_multi_pw_aff *pet_expr_access_get_index(
__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
struct ppcg_scop *ps, struct ppcg_options *options) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt,
__isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
unsigned pet_loc_get_start(__isl_keep pet_loc *loc) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
unsigned pet_loc_get_end(__isl_keep pet_loc *loc) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts;
 * "transform" is never invoked.
 */
int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
__isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
__isl_take pet_scop *scop, void *user), void *user) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
__isl_take isl_printer *p) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts;
 * "scop" is not freed.
 */
__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts;
 * "fn" is never invoked.
 */
int pet_tree_foreach_expr(__isl_keep pet_tree *tree,
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts;
 * "fn" is never invoked.
 */
int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr,
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
int pet_stmt_is_kill(struct pet_stmt *stmt) {
die();
}
struct isl_args pet_options_args;
/* Dummy stand-in: reports the call through die() and aborts. */
const char *ppcg_version(void) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts;
 * no option is changed.
 */
int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
const char *input, const char *output) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
const char *input, const char *output) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts;
 * neither "fn_index" nor "fn_expr" is ever invoked.
 */
__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt,
__isl_keep isl_ast_build *build,
__isl_give isl_multi_pw_aff *(*fn_index)(
__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id,
void *user), void *user_index,
__isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr,
__isl_keep isl_id *id, void *user), void *user_expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_get_tagged_may_writes(
__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_get_tagged_must_writes(
__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_scop_get_tagged_must_kills(
__isl_keep pet_scop *scop) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr,
__isl_keep const char *name) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give pet_expr *pet_expr_new_cast(const char *type_name,
__isl_take pet_expr *arg) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos,
__isl_take pet_expr *arg) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts;
 * "tree" is not freed.
 */
__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts;
 * "fn" is never invoked.
 */
__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree,
__isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user),
void *user) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_expr_access_get_may_read(
__isl_keep pet_expr *expr) {
die();
}
/* Dummy stand-in: reports the call through die() and aborts. */
__isl_give isl_union_map *pet_expr_access_get_may_write(
__isl_keep pet_expr *expr) {
die();
}

File diff suppressed because it is too large Load Diff

View File

@ -1,459 +0,0 @@
#ifndef _GPU_H
#define _GPU_H
#include <isl/ast.h>
#include <isl/id.h>
#include <isl/id_to_ast_expr.h>
#include <pet.h>
#include "ppcg.h"
#include "ppcg_options.h"
/* An access to an outer array element or an iterator.
 * Accesses to iterators have an access relation that maps to an unnamed space.
 * An access may be both read and write.
 * If the access relation is empty, then the output dimension may
 * not be equal to the dimension of the corresponding array.
 */
struct gpu_stmt_access {
/* Access reads elements */
int read;
/* Access writes elements */
int write;
/* All writes are definite writes. */
int exact_write;
/* Is a single, fixed element being accessed? */
isl_bool fixed_element;
/* The number of index expressions specified in the access. */
int n_index;
/* May access relation */
isl_map *access;
/* May access relation with as domain a mapping from iteration domain
 * to a reference identifier.
 */
isl_map *tagged_access;
/* The reference id of the corresponding pet_expr. */
isl_id *ref_id;
/* Next access in the linked list of accesses of the statement
 * (see the "accesses" field of struct gpu_stmt).
 */
struct gpu_stmt_access *next;
};
/* A representation of a user statement.
 * "stmt" points to the corresponding pet statement.
 * "id" is the identifier of the instance set of the statement.
 * "accesses" is a linked list of accesses performed by the statement.
 * If the statement has been killed, i.e., if it will not be scheduled,
 * then this linked list may be empty even if the actual statement does
 * perform accesses.
 */
struct gpu_stmt {
/* Identifier of the instance set. */
isl_id *id;
/* The corresponding pet statement. */
struct pet_stmt *stmt;
/* Head of the linked list of accesses (chained through "next"). */
struct gpu_stmt_access *accesses;
};
/* Represents an outer array possibly accessed by a gpu_prog.
 */
struct gpu_array_info {
/* The array data space. */
isl_space *space;
/* Element type. */
char *type;
/* Element size. */
int size;
/* Name of the array. */
char *name;
/* Declared extent of original array. */
isl_set *declared_extent;
/* AST expression for declared size of original array. */
isl_ast_expr *declared_size;
/* Extent of the array that needs to be copied. */
isl_set *extent;
/* Number of indices. */
unsigned n_index;
/* For each index, a bound on "extent" in that direction. */
isl_multi_pw_aff *bound;
/* The corresponding access AST expression, if the array needs
 * to be allocated on the device.
 */
isl_ast_expr *bound_expr;
/* All references to this array; point to elements of a linked list. */
int n_ref;
struct gpu_stmt_access **refs;
/* Is this array accessed at all by the program? */
int accessed;
/* Is this a scalar that is read-only within the entire program? */
int read_only_scalar;
/* Are the elements of the array structures? */
int has_compound_element;
/* Are the elements only accessed through constant index expressions? */
int only_fixed_element;
/* Is the array local to the scop? */
int local;
/* Is the array local and should it be declared on the host? */
int declare_local;
/* Is the corresponding global device memory accessed in any way? */
int global;
/* Should the array be linearized? */
int linearize;
/* Order dependences on this array.
 * Only used if live_range_reordering option is set.
 * It is set to NULL otherwise.
 */
isl_union_map *dep_order;
/* Opaque pointer.
 * NOTE(review): its use is not visible in this header; presumably
 * reserved for the client of this structure -- confirm.
 */
void *user;
};
/* Represents an outer array accessed by a ppcg_kernel, localized
 * to the context of this kernel.
 *
 * "array" points to the corresponding array in the gpu_prog.
 * The "n_group" "groups" are the reference groups associated to the array.
 * If "force_private" is set, then the array (in practice a scalar)
 * must be mapped to a register.
 * "global" is set if the global device memory corresponding
 * to this array is accessed by the kernel.
 * "bound" is equal to array->bound specialized to the current kernel.
 * "bound_expr" is the corresponding access AST expression.
 */
struct gpu_local_array_info {
struct gpu_array_info *array;
int n_group;
struct gpu_array_ref_group **groups;
int force_private;
int global;
/* NOTE(review): not described above; presumably the number of indices,
 * mirroring array->n_index -- confirm.
 */
unsigned n_index;
isl_multi_pw_aff *bound;
isl_ast_expr *bound_expr;
};
__isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr);
/* A sequence of "n" names of types.
*/
struct gpu_types {
int n;
char **name;
};
/* "read" and "write" contain the original access relations, possibly
 * involving member accesses.
 *
 * The elements of "array", as well as the ranges of "copy_in" and "copy_out"
 * only refer to the outer arrays of any possible member accesses.
 */
struct gpu_prog {
/* The isl context. */
isl_ctx *ctx;
/* The scop this program is derived from.
 * NOTE(review): inferred from the field type -- confirm.
 */
struct ppcg_scop *scop;
/* Set of parameter values */
isl_set *context;
/* All potential read accesses in the entire program */
isl_union_map *read;
/* All potential write accesses in the entire program */
isl_union_map *may_write;
/* All definite write accesses in the entire program */
isl_union_map *must_write;
/* All tagged definite kills in the entire program */
isl_union_map *tagged_must_kill;
/* The set of inner array elements that may be preserved. */
isl_union_set *may_persist;
/* A mapping from all innermost arrays to their outer arrays. */
isl_union_map *to_outer;
/* A mapping from the outer arrays to all corresponding inner arrays. */
isl_union_map *to_inner;
/* A mapping from all intermediate arrays to their outer arrays,
 * including an identity mapping from the anonymous 1D space to itself.
 */
isl_union_map *any_to_outer;
/* Order dependences on non-scalars. */
isl_union_map *array_order;
/* Array of statements */
int n_stmts;
struct gpu_stmt *stmts;
/* Array of "n_array" outer arrays */
int n_array;
struct gpu_array_info *array;
};
/* State of a GPU code generation run. */
struct gpu_gen {
/* The isl context. */
isl_ctx *ctx;
/* The ppcg options in effect (e.g., "unroll_gpu_tile" is consulted
 * when phases of a hybrid tiling are mapped to the device).
 */
struct ppcg_options *options;
/* Callback for printing of AST in appropriate format. */
__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
struct gpu_types *types, void *user);
/* NOTE(review): presumably the value passed as "user" to "print" --
 * confirm.
 */
void *print_user;
/* NOTE(review): callback that produces an isl_id_to_ast_expr for "stmt"
 * within "build"; "fn_index"/"user_index" and "fn_expr"/"user_expr" look
 * like hooks for transforming index expressions and AST expressions,
 * respectively -- confirm against the code that installs this callback.
 */
isl_id_to_ast_expr *(*build_ast_expr)(void *stmt,
isl_ast_build *build,
isl_multi_pw_aff *(*fn_index)(
__isl_take isl_multi_pw_aff *mpa, isl_id *id,
void *user),
void *user_index,
isl_ast_expr *(*fn_expr)(isl_ast_expr *expr,
isl_id *id, void *user),
void *user_expr);
/* NOTE(review): presumably the program being processed -- confirm. */
struct gpu_prog *prog;
/* The generated AST. */
isl_ast_node *tree;
/* The sequence of types for which a definition has been printed. */
struct gpu_types types;
/* User specified tile, grid and block sizes for each kernel */
isl_union_map *sizes;
/* Effectively used tile, grid and block sizes for each kernel */
isl_union_map *used_sizes;
/* Identifier of the next kernel. */
int kernel_id;
};
/* Classification of how an array (reference group) is accessed.
 * It is the return type of gpu_array_ref_group_type and the type
 * of a kernel-local variable (ppcg_kernel_var).
 * NOTE(review): the enumerators presumably correspond to accesses from
 * global memory, through a shared memory tile and through a private
 * (register) tile, respectively -- confirm against gpu_array_ref_group_type.
 */
enum ppcg_group_access_type {
ppcg_access_global,
ppcg_access_shared,
ppcg_access_private
};
/* The kind of statement represented by a ppcg_kernel_stmt:
 * a copy statement, a statement corresponding to an input statement
 * or a synchronization (__syncthreads) statement.
 */
enum ppcg_kernel_stmt_type {
ppcg_kernel_copy,
ppcg_kernel_domain,
ppcg_kernel_sync
};
/* Representation of special statements, in particular copy statements
 * and __syncthreads statements, inside a kernel.
 *
 * type represents the kind of statement
 *
 *
 * for ppcg_kernel_copy statements we have (in u.c)
 *
 * read is set if the statement should copy data from global memory
 * to shared memory or registers.
 *
 * index expresses an access to the array element that needs to be copied
 * local_index expresses the corresponding element in the tile
 *
 * array refers to the original array being copied
 * local_array is a pointer to the appropriate element in the "array"
 * array of the ppcg_kernel to which this copy access belongs
 *
 *
 * for ppcg_kernel_domain statements we have (in u.d)
 *
 * stmt is the corresponding input statement
 *
 * ref2expr is an isl_id_to_ast_expr that is handed to pet_stmt_print_body
 * when the statement body is printed.
 * NOTE(review): presumably it maps the identifiers of the array references
 * in stmt to the AST expressions printed in their place -- confirm.
 *
 * NOTE(review): earlier documentation mentioned "n_access" and "access"
 * fields; the structure does not have such fields.
 */
struct ppcg_kernel_stmt {
enum ppcg_kernel_stmt_type type;
union {
struct {
int read;
isl_ast_expr *index;
isl_ast_expr *local_index;
struct gpu_array_info *array;
struct gpu_local_array_info *local_array;
} c;
struct {
struct gpu_stmt *stmt;
isl_id_to_ast_expr *ref2expr;
} d;
} u;
};
/* Representation of a local variable in a kernel.
 *
 * NOTE(review): the field descriptions below are inferred from the field
 * names and types only; confirm against the code that fills them in.
 */
struct ppcg_kernel_var {
/* Presumably the array for which this variable holds a local copy. */
struct gpu_array_info *array;
/* Presumably the kind of memory in which the variable lives. */
enum ppcg_group_access_type type;
/* Presumably the name under which the variable is declared. */
char *name;
/* Presumably the size of the variable in each dimension. */
isl_vec *size;
};
/* Representation of a kernel.
*
* prog describes the original code from which the kernel is extracted.
*
* id is the sequence number of the kernel.
*
* block_ids contains the list of block identifiers for this kernel.
* thread_ids contains the list of thread identifiers for this kernel.
*
* the first n_grid elements of grid_dim represent the specified size
* of the grid.
* the first n_block elements of block_dim represent the specified or
* effective size of the block.
* Note that in the input file, the sizes of the grid and the blocks
* are specified in the order x, y, z, but internally, the sizes
* are stored in reverse order, so that the last element always
* refers to the x dimension.
*
* grid_size reflects the effective grid size.
* grid_size_expr contains a corresponding access AST expression, built within
* the context where the launch appears.
*
* context contains the values of the parameters and outer schedule dimensions
* for which any statement instance in this kernel needs to be executed.
*
* n_sync is the number of synchronization operations that have
* been introduced in the schedule tree corresponding to this kernel (so far).
*
* core contains the spaces of the statement domains that form
* the core computation of the kernel. It is used to navigate
* the tree during the construction of the device part of the schedule
* tree in gpu_create_kernel.
*
* expanded_domain contains the original statement instances,
* i.e., those that appear in the domains of access relations,
* that are involved in the kernel.
* contraction maps those original statement instances to
* the statement instances that are active at the point
* in the schedule tree where the kernel is created.
*
* arrays is the set of possibly accessed outer array elements.
*
* space is the schedule space of the AST context. That is, it represents
* the loops of the generated host code containing the kernel launch.
*
* n_array is the total number of arrays in the input program and also
* the number of elements in the "array" array.
* array contains information about each array that is local
* to the current kernel. If an array is not used in a kernel,
* then the corresponding entry does not contain any information.
*
* any_force_private is set if any array in the kernel is marked force_private
*
* block_filter contains constraints on the domain elements in the kernel
* that encode the mapping to block identifiers, where the block identifiers
* are represented by "n_grid" parameters with as names the elements
* of "block_ids".
*
* thread_filter contains constraints on the domain elements in the kernel
* that encode the mapping to thread identifiers, where the thread identifiers
* are represented by "n_block" parameters with as names the elements
* of "thread_ids".
*
* copy_schedule corresponds to the schedule dimensions of
* the (tiled) schedule for this kernel that have been taken into account
* for computing private/shared memory tiles.
* The domain corresponds to the original statement instances, i.e.,
* those that appear in the leaves of the schedule tree.
* copy_schedule_dim is the dimension of this schedule.
*
* sync_writes contains write references that require synchronization.
* Each reference is represented by a universe set in a space [S[i,j] -> R[]]
* with S[i,j] the statement instance space and R[] the array reference.
*/
struct ppcg_kernel {
/* The isl context. */
isl_ctx *ctx;
/* NOTE(review): presumably the ppcg options in effect when
 * the kernel was created -- confirm.
 */
struct ppcg_options *options;
struct gpu_prog *prog;
int id;
isl_id_list *block_ids;
isl_id_list *thread_ids;
int n_grid;
int n_block;
/* Storage for at most two grid and three block dimensions;
 * only the first n_grid/n_block entries are meaningful.
 */
int grid_dim[2];
int block_dim[3];
isl_multi_pw_aff *grid_size;
isl_ast_expr *grid_size_expr;
isl_set *context;
int n_sync;
isl_union_set *core;
isl_union_set *arrays;
isl_union_pw_multi_aff *contraction;
isl_union_set *expanded_domain;
isl_space *space;
int n_array;
struct gpu_local_array_info *array;
/* NOTE(review): presumably the "n_var" local variables of
 * the kernel -- confirm.
 */
int n_var;
struct ppcg_kernel_var *var;
int any_force_private;
isl_union_set *block_filter;
isl_union_set *thread_filter;
isl_union_pw_multi_aff *copy_schedule;
int copy_schedule_dim;
isl_union_set *sync_writes;
/* NOTE(review): presumably the AST generated for the kernel body --
 * confirm.
 */
isl_ast_node *tree;
};
/* Functions operating on the structures declared in this header.
 * Only the declarations appear here; the grouping notes below are derived
 * from the signatures and should be checked against the definitions.
 * Arguments marked __isl_take are consumed by the callee, results marked
 * __isl_give are owned by the caller and arguments marked __isl_keep
 * remain owned by the caller.
 */
/* Queries on a single gpu_array_info. */
int gpu_array_is_scalar(struct gpu_array_info *array);
int gpu_array_is_read_only_scalar(struct gpu_array_info *array);
int gpu_array_requires_device_allocation(struct gpu_array_info *array);
__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array);
isl_bool gpu_array_can_be_private(struct gpu_array_info *array);
/* NOTE(review): presumably allocation and release of a gpu_prog
 * for "scop" -- confirm.
 */
struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop);
void *gpu_prog_free(struct gpu_prog *prog);
/* NOTE(review): "i" is presumably an index into the arrays of "kernel" --
 * confirm.
 */
int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i);
/* The "print" callback has the same signature as the "print" field
 * of struct gpu_gen.
 */
int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
struct ppcg_options *options,
__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
struct gpu_types *types, void *user), void *user);
/* Create a kernel for the subtree at "node".  The hybrid tiling code calls
 * this with "scale" equal to 0 and "sizes" equal to NULL.
 */
__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
__isl_take isl_schedule_node *node, int scale,
__isl_keep isl_multi_val *sizes);
__isl_give isl_schedule *get_schedule(struct gpu_gen *gen);
int has_any_permutable_node(__isl_keep isl_schedule *schedule);
__isl_give isl_schedule *map_to_device(struct gpu_gen *gen,
__isl_take isl_schedule *schedule,
int to_from_device);
__isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
__isl_take isl_schedule *schedule);
__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog);
void collect_references(struct gpu_prog *prog, struct gpu_array_info *array);
void collect_order_dependences(struct gpu_prog *prog);
isl_bool only_fixed_element_accessed(struct gpu_array_info *array);
#endif

View File

@ -1,71 +0,0 @@
#include <isl/aff.h>
#include <isl/map.h>
#include "gpu_array_tile.h"
struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile)
{
int j;
if (!tile)
return NULL;
for (j = 0; j < tile->n; ++j) {
isl_val_free(tile->bound[j].size);
isl_val_free(tile->bound[j].stride);
isl_aff_free(tile->bound[j].lb);
isl_aff_free(tile->bound[j].shift);
}
free(tile->bound);
isl_multi_aff_free(tile->tiling);
free(tile);
return NULL;
}
/* Create a gpu_array_tile for an array of dimension "n_index".
 *
 * The tile is zero-initialized, remembers "ctx" and gets an array of
 * "n_index" bounds whose size, lb, stride and shift are all set to NULL.
 * Return NULL on allocation failure.
 *
 * For "n_index" equal to zero (a scalar), the allocator is allowed
 * to return NULL for the zero-sized bound array.  This is not a failure:
 * the tile is then returned with an empty (NULL) "bound" array, which
 * gpu_array_tile_free and gpu_array_tile_size handle since they never
 * look at "bound" when "n" is zero.
 */
struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index)
{
	int i;
	struct gpu_array_tile *tile;

	tile = isl_calloc_type(ctx, struct gpu_array_tile);
	if (!tile)
		return NULL;

	tile->ctx = ctx;
	tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
	/* Only a failed non-empty allocation is an error. */
	if (!tile->bound && n_index != 0)
		return gpu_array_tile_free(tile);

	/* "n" is only set once "bound" is known to be usable, such that
	 * gpu_array_tile_free never walks a missing bound array.
	 */
	tile->n = n_index;

	for (i = 0; i < n_index; ++i) {
		tile->bound[i].size = NULL;
		tile->bound[i].lb = NULL;
		tile->bound[i].stride = NULL;
		tile->bound[i].shift = NULL;
	}

	return tile;
}
/* Return the number of elements in "tile", i.e., the product
 * of the sizes of its "n" bounds (one if "n" is zero).
 * Return NULL if "tile" is NULL.
 */
__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile)
{
	int pos = 0;
	isl_val *product;

	if (tile == NULL)
		return NULL;

	product = isl_val_one(tile->ctx);
	while (pos < tile->n) {
		isl_val *extent = isl_val_copy(tile->bound[pos].size);

		product = isl_val_mul(product, extent);
		pos++;
	}

	return product;
}

View File

@ -1,59 +0,0 @@
#ifndef GPU_ARRAY_TILE_H
#define GPU_ARRAY_TILE_H
#include <isl/aff_type.h>
#include <isl/map_type.h>
#include <isl/val.h>
/* The bound on a single index of a tile: a lower bound "lb" and
 * a size "size".
 *
 * The fields stride and shift only contain valid information
 * if shift != NULL.
 * If so, they express that current index is such that if you add shift,
 * then the result is always a multiple of stride.
 * Let D represent the initial tile->depth dimensions of the computed schedule.
 * The spaces of "lb" and "shift" are of the form
 *
 * D -> [b]
 */
struct gpu_array_bound {
/* Size of the tile in this index. */
isl_val *size;
/* Lower bound of the tile in this index. */
isl_aff *lb;
/* Only meaningful if shift != NULL. */
isl_val *stride;
/* NULL if there is no stride information. */
isl_aff *shift;
};
/* A tile of an outer array.
 *
 * ctx is the isl context in which the tile was created; it is used
 * by gpu_array_tile_size to construct the resulting value.
 *
 * requires_unroll is set if the schedule dimensions that are mapped
 * to threads need to be unrolled for this (private) tile to be used.
 *
 * "depth" reflects the number of schedule dimensions that affect the tile.
 * The copying into and/or out of the tile is performed at that depth.
 *
 * n is the dimension of the array.
 * bound is an array of size "n" representing the lower bound
 * and size for each index.
 *
 * tiling maps a tile in the global array to the corresponding
 * shared/private memory tile and is of the form
 *
 * { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
 *
 * where D represents the initial "depth" dimensions
 * of the computed schedule.
 */
struct gpu_array_tile {
isl_ctx *ctx;
int requires_unroll;
int depth;
int n;
struct gpu_array_bound *bound;
isl_multi_aff *tiling;
};
/* Create a tile with "n_index" bounds, all fields of which are NULL.
 * Return NULL on failure.
 */
struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index);
/* Free "tile" (which may be NULL) and everything it owns.
 * Always returns NULL.
 */
struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile);
/* Return the number of elements in "tile", i.e., the product
 * of the sizes of its bounds.
 */
__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile);
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,65 +0,0 @@
#ifndef GPU_GROUP_H
#define GPU_GROUP_H
#include <isl/schedule_node.h>
#include "gpu.h"
/* A group of array references in a kernel that should be handled together.
 * If private_tile is not NULL, then it is mapped to registers.
 * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
 * Otherwise, it is accessed from global memory.
 * Note that if both private_tile and shared_tile are set, then shared_tile
 * is only used inside group_common_shared_memory_tile.
 */
struct gpu_array_ref_group {
/* The references in this group access this local array. */
struct gpu_local_array_info *local_array;
/* This is the corresponding array. */
struct gpu_array_info *array;
/* Position of this group in the list of reference groups of array. */
int nr;
/* The following fields are used during the construction of the groups.
 * access is the combined access relation relative to the private
 * memory tiling. In particular, the domain of the map corresponds
 * to the first thread_depth dimensions of the kernel schedule.
 * write is set if any access in the group is a write.
 * exact_write is set if all writes are definite writes.
 * slice is set if there is at least one access in the group
 * that refers to more than one element.
 * "min_depth" is the minimum of the tile depths and thread_depth.
 */
isl_map *access;
int write;
int exact_write;
int slice;
int min_depth;
/* The shared memory tile, NULL if none. */
struct gpu_array_tile *shared_tile;
/* The private memory tile, NULL if none. */
struct gpu_array_tile *private_tile;
/* References in this group; point to elements of a linked list. */
int n_ref;
struct gpu_stmt_access **refs;
};
/* Functions operating on reference groups.
 * Only the declarations appear here; the notes below are derived from
 * the names and signatures and should be checked against the definitions.
 */
/* NOTE(review): presumably groups the array references of "kernel",
 * with "node" a position in its schedule tree -- confirm.
 */
int gpu_group_references(struct ppcg_kernel *kernel,
__isl_keep isl_schedule_node *node);
__isl_give isl_printer *gpu_array_ref_group_print_name(
struct gpu_array_ref_group *group, __isl_take isl_printer *p);
void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group);
/* NOTE(review): "read" and "write" presumably select which kinds
 * of accesses are included in the result -- confirm.
 */
__isl_give isl_union_map *gpu_array_ref_group_access_relation(
struct gpu_array_ref_group *group, int read, int write);
int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group);
enum ppcg_group_access_type gpu_array_ref_group_type(
struct gpu_array_ref_group *group);
struct gpu_array_tile *gpu_array_ref_group_tile(
struct gpu_array_ref_group *group);
struct gpu_array_ref_group *gpu_array_ref_group_free(
struct gpu_array_ref_group *group);
#endif

View File

@ -1,146 +0,0 @@
/*
* Copyright 2013 Ecole Normale Superieure
* Copyright 2015 Sven Verdoolaege
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege,
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
*/
#include <string.h>
#include <isl/val.h>
#include <isl/space.h>
#include <isl/union_set.h>
#include <isl/schedule_node.h>
#include "hybrid.h"
#include "gpu_hybrid.h"
#include "gpu_tree.h"
#include "schedule.h"
#include "util.h"
/* Is the set of domain elements that reach the position "node"
 * in the schedule tree empty, i.e., have they all been filtered out?
 * The result of the emptiness test (including a possible error)
 * is returned as is.
 */
static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node)
{
	isl_union_set *reaching = isl_schedule_node_get_domain(node);
	isl_bool is_empty = isl_union_set_is_empty(reaching);

	isl_union_set_free(reaching);

	return is_empty;
}
/* Given a pointer to a phase in the result of hybrid tiling,
 * map the phase to the device, provided the phase is non-empty.
 * Empty phases can occur if the input schedule domain can be
 * covered by a small number of hexagons that all belong to the same phase.
 *
 * The input has the following form:
 *
 *	M - CT - P - C - ...
 *
 * with M the phase marker, CT the space tiling, P the original
 * parent band and C the original child band.
 * The (outer dimensions of the) C band need to be mapped to threads.
 * The (outer dimension of the) CT band needs to be mapped to blocks.
 * The mapping to shared memory needs to be computed between the CT and
 * the P band.
 *
 * The C band is first shifted to start at zero.
 * Then the appropriate markers are introduced and a kernel is
 * created for the tree rooted at CT.
 * If the "unroll_gpu_tile" option is set, then the AST generator
 * is instructed to unroll the P and C bands.
 *
 * "user" points to the struct gpu_gen.
 * The returned node points to the same position as the input node.
 * A NULL "node" results in NULL; an empty phase is returned untouched.
 */
static __isl_give isl_schedule_node *update_phase(
	__isl_take isl_schedule_node *node, void *user)
{
	struct gpu_gen *gen = user;
	int depth0, depth;
	isl_ctx *ctx;
	isl_id *id;
	isl_bool empty_domain;
	ppcg_ht_phase *phase;

	/* Check for a missing node before querying its domain. */
	if (!node)
		return NULL;

	empty_domain = has_empty_domain(node);
	if (empty_domain < 0)
		return isl_schedule_node_free(node);
	if (empty_domain)
		return node;

	ctx = isl_schedule_node_get_ctx(node);
	phase = ppcg_ht_phase_extract_from_mark(node);

	/* Remember where M is, such that we can move back to it at the end. */
	depth0 = isl_schedule_node_get_tree_depth(node);

	/* Move down from M over CT and P to C. */
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_child(node, 0);
	node = isl_schedule_node_child(node, 0);

	node = ppcg_ht_phase_shift_space_point(phase, node);
	if (gen->options->unroll_gpu_tile)
		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
	id = isl_id_alloc(ctx, "thread", NULL);
	node = isl_schedule_node_insert_mark(node, id);

	/* Move up to P. */
	node = isl_schedule_node_parent(node);
	if (gen->options->unroll_gpu_tile)
		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
	id = isl_id_alloc(ctx, "shared", NULL);
	node = isl_schedule_node_insert_mark(node, id);

	/* Move up to CT and create the kernel there. */
	node = isl_schedule_node_parent(node);
	node = gpu_create_kernel(gen, node, 0, NULL);

	/* Return to the original position. */
	depth = isl_schedule_node_get_tree_depth(node);
	node = isl_schedule_node_ancestor(node, depth - depth0);

	return node;
}
/* Perform hybrid tiling on "node" and its parent, using the (valid)
 * bounds on the relative dependence distances in "bounds" and
 * the tile sizes in "tile_sizes".
 * "tile_sizes" has at least as many elements as the parent and
 * the child node have dimensions together.
 *
 * The tile sizes are first turned into an isl_multi_val living in
 * the product of the parent and the child band space.
 * Then the hybrid tiling is inserted, a kernel is created inside
 * each phase and, finally, the phase marks are removed again.
 *
 * If either "node" or "bounds" is missing, both are freed and
 * NULL is returned.
 */
__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
	int *tile_sizes)
{
	isl_multi_val *sizes;
	isl_space *parent_space, *child_space;

	if (!node || !bounds) {
		isl_schedule_node_free(node);
		ppcg_ht_bounds_free(bounds);
		return NULL;
	}

	child_space = isl_schedule_node_band_get_space(node);
	node = isl_schedule_node_parent(node);
	parent_space = isl_schedule_node_band_get_space(node);
	parent_space = isl_space_product(parent_space, child_space);
	sizes = ppcg_multi_val_from_int_list(parent_space, tile_sizes);

	node = ppcg_ht_bounds_insert_tiling(bounds, sizes, node, gen->options);
	node = hybrid_tile_foreach_phase(node, &update_phase, gen);

	return hybrid_tile_drop_phase_marks(node);
}

View File

@ -1,13 +0,0 @@
#ifndef GPU_HYBRID_H
#define GPU_HYBRID_H
#include <isl/schedule_node.h>
#include "gpu.h"
#include "hybrid.h"
/* Apply hybrid tiling on "node" and its parent based on "bounds" and
 * "tile_sizes" and create a kernel inside each resulting phase.
 * Both "node" and "bounds" are consumed; NULL is returned if either
 * of them is NULL.
 */
__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
int *tile_sizes);
#endif

View File

@ -1,310 +0,0 @@
/*
* Copyright 2012 Ecole Normale Superieure
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege,
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
*/
#include <string.h>
#include <isl/aff.h>
#include "gpu_print.h"
#include "print.h"
#include "schedule.h"
/* Print to "p" a declaration for every array of "prog" that has
 * its "declare_local" flag set, i.e., for arrays that are local to "prog"
 * but that are used on the host and therefore require a declaration.
 * Each declaration is printed with the array's type and declared size.
 * If "prog" is NULL, then "p" is freed and NULL is returned.
 */
__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	int idx;

	if (prog == NULL)
		return isl_printer_free(p);

	for (idx = 0; idx < prog->n_array; idx++) {
		struct gpu_array_info *info = prog->array + idx;

		if (info->declare_local)
			p = ppcg_print_declaration_with_size(p, info->type,
							info->declared_size);
	}

	return p;
}
/* Print an expression for the size of "array" in bytes on "prn".
 *
 * The expression is of the form
 *
 *	(bound_0) * (bound_1) * ... * sizeof(type)
 *
 * where bound_i is argument 1 + i of array->bound_expr.
 */
__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn,
	struct gpu_array_info *array)
{
	int dim = 0;

	while (dim < array->n_index) {
		isl_ast_expr *extent;

		prn = isl_printer_print_str(prn, "(");
		extent = isl_ast_expr_get_op_arg(array->bound_expr, dim + 1);
		prn = isl_printer_print_ast_expr(prn, extent);
		isl_ast_expr_free(extent);
		prn = isl_printer_print_str(prn, ") * ");
		dim++;
	}

	prn = isl_printer_print_str(prn, "sizeof(");
	prn = isl_printer_print_str(prn, array->type);

	return isl_printer_print_str(prn, ")");
}
/* Print the declaration of an array argument that is not linearized:
 * the element type, a space and the bound expression of "array".
 */
static __isl_give isl_printer *print_non_linearized_declaration_argument(
	__isl_take isl_printer *p, struct gpu_array_info *array)
{
	p = isl_printer_print_str(isl_printer_print_str(p, array->type), " ");

	return isl_printer_print_ast_expr(p, array->bound_expr);
}
/* Print the declaration of the argument corresponding to "array".
 * If "memory_space" is not NULL, then it is printed as a prefix,
 * except for read-only scalars.
 *
 * A read-only scalar is declared as "type name".
 * An array with at least one index that is not linearized is declared
 * through its bound expression.
 * Anything else is declared as a pointer "type *name".
 */
__isl_give isl_printer *gpu_array_info_print_declaration_argument(
	__isl_take isl_printer *p, struct gpu_array_info *array,
	const char *memory_space)
{
	int by_value = gpu_array_is_read_only_scalar(array);

	if (!by_value && memory_space != NULL) {
		p = isl_printer_print_str(p, memory_space);
		p = isl_printer_print_str(p, " ");
	}

	if (!by_value && array->n_index != 0 && !array->linearize)
		return print_non_linearized_declaration_argument(p, array);

	p = isl_printer_print_str(p, array->type);
	p = isl_printer_print_str(p, " ");
	if (!by_value)
		p = isl_printer_print_str(p, "*");

	return isl_printer_print_str(p, array->name);
}
/* Print the argument passed for "array" in a call.
 * A read-only scalar is passed by name; any other array is passed
 * as its name prefixed with "dev_".
 */
__isl_give isl_printer *gpu_array_info_print_call_argument(
	__isl_take isl_printer *p, struct gpu_array_info *array)
{
	if (!gpu_array_is_read_only_scalar(array))
		p = isl_printer_print_str(p, "dev_");

	return isl_printer_print_str(p, array->name);
}
/* Print the access to the private/shared memory side of the copy
 * statement "stmt", which is stored in stmt->u.c.local_index
 * as an access to the array.
 */
static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	isl_ast_expr *local = stmt->u.c.local_index;

	return isl_printer_print_ast_expr(p, local);
}
/* Print the access to the global memory side of the copy
 * statement "stmt".
 *
 * For a non-scalar array, the access stored in stmt->u.c.index is printed.
 * A scalar is printed by name, preceded by "*" unless it is
 * a read-only scalar.
 */
static __isl_give isl_printer *stmt_print_global_index(
	__isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
{
	struct gpu_array_info *target = stmt->u.c.array;

	if (!gpu_array_is_scalar(target)) {
		isl_ast_expr *access = isl_ast_expr_copy(stmt->u.c.index);

		p = isl_printer_print_ast_expr(p, access);
		isl_ast_expr_free(access);
		return p;
	}

	if (!gpu_array_is_read_only_scalar(target))
		p = isl_printer_print_str(p, "*");

	return isl_printer_print_str(p, target->name);
}
/* Print the copy statement "stmt" on a line of its own.
 *
 * If stmt->u.c.read is set, the statement is printed as
 *
 *	local = global;
 *
 * Otherwise, it is printed as
 *
 *	global = local;
 */
__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	int to_local = stmt->u.c.read;

	p = isl_printer_start_line(p);

	/* Left-hand side. */
	p = to_local ? stmt_print_local_index(p, stmt)
		     : stmt_print_global_index(p, stmt);
	p = isl_printer_print_str(p, " = ");
	/* Right-hand side. */
	p = to_local ? stmt_print_global_index(p, stmt)
		     : stmt_print_local_index(p, stmt);

	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}
/* Print the domain statement "stmt" by having pet_stmt_print_body
 * print the body of the underlying input statement,
 * passing along stmt->u.d.ref2expr.
 */
__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	struct gpu_stmt *input = stmt->u.d.stmt;
	isl_id_to_ast_expr *ref2expr = stmt->u.d.ref2expr;

	return pet_stmt_print_body(input->stmt, p, ref2expr);
}
/* This function is called for each node in a GPU AST.
 * In case of a user node, print the macro definitions required
 * for printing the AST expressions in the annotation, if any.
 * For other nodes, return true such that descendants are also
 * visited.
 *
 * In particular, for a kernel launch, print the macro definitions
 * needed for the grid size.
 * For a copy statement, print the macro definitions needed
 * for the two index expressions.
 * For an original user statement, print the macro definitions
 * needed for the substitutions.
 *
 * "user" points to the isl_printer pointer that is updated in place.
 * A user node without annotation is skipped.
 * An annotation without a name or without a user pointer, or a failure
 * of the printer, results in an error.
 */
static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
{
	const char *name;
	isl_id *id;
	int is_kernel;
	struct ppcg_kernel *kernel;
	struct ppcg_kernel_stmt *stmt;
	isl_printer **p = user;

	if (isl_ast_node_get_type(node) != isl_ast_node_user)
		return isl_bool_true;

	id = isl_ast_node_get_annotation(node);
	if (!id)
		return isl_bool_false;

	name = isl_id_get_name(id);
	if (!name) {
		/* Do not leak the annotation on this error path. */
		isl_id_free(id);
		return isl_bool_error;
	}
	/* An annotation called "kernel" carries a ppcg_kernel;
	 * any other annotation carries a ppcg_kernel_stmt.
	 */
	is_kernel = !strcmp(name, "kernel");
	kernel = is_kernel ? isl_id_get_user(id) : NULL;
	stmt = is_kernel ? NULL : isl_id_get_user(id);
	isl_id_free(id);

	if ((is_kernel && !kernel) || (!is_kernel && !stmt))
		return isl_bool_error;

	if (is_kernel) {
		*p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p);
	} else if (stmt->type == ppcg_kernel_copy) {
		*p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p);
		*p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p);
	} else if (stmt->type == ppcg_kernel_domain) {
		*p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr);
	}
	if (!*p)
		return isl_bool_error;

	/* There is nothing of interest below a user node. */
	return isl_bool_false;
}
/* Print to "p" the macros that are required for printing
 * the GPU AST "node".
 * The macros needed by the user statements inside the AST are
 * printed first (through at_node), followed by those printed
 * by ppcg_print_macros for "node" itself.
 * If the traversal of the AST fails, "p" is freed and NULL is returned.
 */
__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node)
{
	if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) >= 0)
		return ppcg_print_macros(p, node);

	return isl_printer_free(p);
}
/* Does the name of "type" appear among the "n" names recorded
 * in "types", i.e., was the definition of "type" printed before?
 * Return 1 if so and 0 otherwise.
 */
static int already_printed(struct gpu_types *types,
	struct pet_type *type)
{
	int pos = 0;

	while (pos < types->n) {
		if (strcmp(types->name[pos], type->name) == 0)
			return 1;
		pos++;
	}

	return 0;
}
/* Print the definitions of all types prog->scop that have not been
 * printed before (according to "types") on "p".
 * Extend the list of printed types "types" with the newly printed types.
 *
 * The list of names is first grown to accommodate all types of the scop,
 * such that names can simply be appended while printing.
 * Each definition is printed on a line of its own, followed by ";".
 * If the list cannot be grown or a name cannot be copied,
 * "p" is freed and NULL is returned.
 */
__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p,
	struct gpu_types *types, struct gpu_prog *prog)
{
	int i, n;
	isl_ctx *ctx;
	char **name;

	n = prog->scop->pet->n_type;

	if (n == 0)
		return p;

	ctx = isl_printer_get_ctx(p);
	name = isl_realloc_array(ctx, types->name, char *, types->n + n);
	if (!name)
		return isl_printer_free(p);
	types->name = name;

	for (i = 0; i < n; ++i) {
		struct pet_type *type = prog->scop->pet->types[i];
		char *copy;

		if (already_printed(types, type))
			continue;

		/* Never store a NULL name: already_printed passes
		 * every recorded name to strcmp.
		 */
		copy = strdup(type->name);
		if (!copy)
			return isl_printer_free(p);

		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p, type->definition);
		p = isl_printer_print_str(p, ";");
		p = isl_printer_end_line(p);

		types->name[types->n++] = copy;
	}

	return p;
}

View File

@ -1,28 +0,0 @@
#ifndef GPU_PRINT_H
#define GPU_PRINT_H
#include "gpu.h"
/* Print declarations for the arrays of "prog" that are marked
 * "declare_local".
 */
__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p,
struct gpu_prog *prog);
/* Print the definitions of the types of prog->scop that are not yet
 * listed in "types" and add them to "types".
 */
__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p,
struct gpu_types *types, struct gpu_prog *prog);
/* Print the macros required for printing the GPU AST "node". */
__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
__isl_keep isl_ast_node *node);
/* Print an expression for the size of "array" in bytes. */
__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn,
struct gpu_array_info *array);
/* Print the declaration of an array argument, optionally prefixed
 * by "memory_space".
 */
__isl_give isl_printer *gpu_array_info_print_declaration_argument(
__isl_take isl_printer *p, struct gpu_array_info *array,
const char *memory_space);
/* Print the call of an array argument. */
__isl_give isl_printer *gpu_array_info_print_call_argument(
__isl_take isl_printer *p, struct gpu_array_info *array);
/* Print a copy statement as "local = global;" or "global = local;". */
__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p,
struct ppcg_kernel_stmt *stmt);
/* Print the body of the input statement underlying "stmt". */
__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p,
struct ppcg_kernel_stmt *stmt);
#endif

View File

@ -1,640 +0,0 @@
/*
* Copyright 2013 Ecole Normale Superieure
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege,
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
*/
#include <string.h>
#include <isl/set.h>
#include <isl/union_set.h>
#include <isl/space.h>
#include "gpu_tree.h"
/* The functions in this file are used to navigate part of a schedule tree
* that is mapped to blocks. Initially, this part consists of a linear
* branch segment with a mark node with name "kernel" on the outer end
* and a mark node with name "thread" on the inner end.
* During the mapping to blocks, branching may be introduced, but only
* one of the elements in each sequence contains the "thread" mark.
* The filter of this element (and only this filter) contains
* domain elements identified by the "core" argument of the functions
* that move down this tree.
*
* Synchronization statements have a name that starts with "sync" and
* a user pointer pointing to the kernel that contains the synchronization.
* The functions inserting or detecting synchronizations take a ppcg_kernel
* argument to be able to create or identify such statements.
* They may also use two fields in this structure, the "core" field
* to move around in the tree and the "n_sync" field to make sure that
* each synchronization has a different name (within the kernel).
*/
/* Is "node" a mark node with an identifier called "name"?
 *
 * Return 1 if so, 0 if not and -1 on error, i.e., if "node" is NULL or
 * if the identifier of the mark cannot be obtained.
 * A mark whose identifier has no name at all is not called "name".
 */
static int is_marked(__isl_keep isl_schedule_node *node, const char *name)
{
	isl_id *mark;
	const char *mark_name;
	int has_name;

	if (!node)
		return -1;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
		return 0;

	mark = isl_schedule_node_mark_get_id(node);
	if (!mark)
		return -1;

	/* Identifiers may be anonymous; never hand NULL to strcmp. */
	mark_name = isl_id_get_name(mark);
	has_name = mark_name && !strcmp(mark_name, name);
	isl_id_free(mark);

	return has_name;
}
/* Does "node" carry the "kernel" mark, i.e., is it a mark node
 * whose identifier is called "kernel"?
 * The result of is_marked is returned as is.
 */
int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node)
{
	static const char kernel_mark[] = "kernel";

	return is_marked(node, kernel_mark);
}
/* Does "node" carry the "shared" mark, i.e., is it a mark node
 * whose identifier is called "shared"?
 * The result of is_marked is returned as is.
 */
static int node_is_shared(__isl_keep isl_schedule_node *node)
{
	static const char shared_mark[] = "shared";

	return is_marked(node, shared_mark);
}
/* Does "node" carry the "thread" mark, i.e., is it a mark node
 * whose identifier is called "thread"?
 * The result of is_marked is returned as is.
 */
static int node_is_thread(__isl_keep isl_schedule_node *node)
{
	static const char thread_mark[] = "thread";

	return is_marked(node, thread_mark);
}
/* Insert a mark node in front of "node" whose identifier
 * is called "shared" and has no user pointer.
 */
static __isl_give isl_schedule_node *insert_shared(
	__isl_take isl_schedule_node *node)
{
	isl_id *shared_mark;

	shared_mark = isl_id_alloc(isl_schedule_node_get_ctx(node),
				"shared", NULL);

	return isl_schedule_node_insert_mark(node, shared_mark);
}
/* Make sure there is a "shared" mark on the linear branch between "node"
 * and the "thread" mark: if no such mark is encountered on the way down
 * (including at "node" itself), then one is inserted right in front
 * of the "thread" mark.
 *
 * While moving down, check that a "thread" mark is actually reached and
 * that no node in between has more than one child.
 * The returned node points to the original position of "node".
 */
__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
	__isl_take isl_schedule_node *node)
{
	int start_depth, end_depth;
	int seen_shared = 0;
	int at_thread;

	if (node == NULL)
		return NULL;

	start_depth = isl_schedule_node_get_tree_depth(node);

	do {
		/* Once a "shared" mark has been seen, stop looking for one. */
		if (!seen_shared) {
			seen_shared = node_is_shared(node);
			if (seen_shared < 0)
				return isl_schedule_node_free(node);
		}

		at_thread = node_is_thread(node);
		if (at_thread < 0)
			return isl_schedule_node_free(node);

		if (!at_thread) {
			int n_child = isl_schedule_node_n_children(node);

			if (n_child == 0)
				isl_die(isl_schedule_node_get_ctx(node),
					isl_error_invalid,
					"no thread marker found",
					return isl_schedule_node_free(node));
			if (n_child > 1)
				isl_die(isl_schedule_node_get_ctx(node),
					isl_error_invalid,
					"expecting single thread marker",
					return isl_schedule_node_free(node));

			node = isl_schedule_node_child(node, 0);
		}
	} while (!at_thread);

	if (!seen_shared)
		node = insert_shared(node);

	/* Move back up to where we started. */
	end_depth = isl_schedule_node_get_tree_depth(node);

	return isl_schedule_node_ancestor(node, end_depth - start_depth);
}
/* Given that "node" is a filter node, does its filter have
 * any element in common with "core", i.e., is this the branch
 * that contains the "thread" mark?
 * Return 1 if so, 0 if not and -1 on error.
 */
static int node_is_core(__isl_keep isl_schedule_node *node,
	__isl_keep isl_union_set *core)
{
	isl_union_set *filter = isl_schedule_node_filter_get_filter(node);
	int no_overlap = isl_union_set_is_disjoint(filter, core);

	isl_union_set_free(filter);

	return no_overlap < 0 ? -1 : !no_overlap;
}
/* Move to the child of "node" on the branch that leads to
 * the "thread" mark, with this branch being identified
 * by the domain elements in "core".
 *
 * A node that is not a sequence is simply replaced by its first child.
 * For a sequence, the children (filters) are inspected in order and
 * the child of the first filter that intersects "core" is returned.
 * It is an (internal) error for no such filter to exist.
 */
static __isl_give isl_schedule_node *core_child(
	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
	int pos = 0;
	int n_child;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
		return isl_schedule_node_child(node, 0);

	n_child = isl_schedule_node_n_children(node);
	while (pos < n_child) {
		int on_core;

		/* Visit the filter at position "pos". */
		node = isl_schedule_node_child(node, pos);
		on_core = node_is_core(node, core);
		if (on_core < 0)
			return isl_schedule_node_free(node);
		if (on_core)
			return isl_schedule_node_child(node, 0);

		/* Not this one; go back to the sequence. */
		node = isl_schedule_node_parent(node);
		pos++;
	}

	isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
		"core child not found", return isl_schedule_node_free(node));
}
/* Descend along the branch between "kernel" and "thread" that is
 * selected by the domain elements in "core" and stop at the "shared" mark.
 * NULL is returned if an error is encountered along the way.
 */
__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
	for (;;) {
		int at_shared = node_is_shared(node);

		if (at_shared < 0)
			return isl_schedule_node_free(node);
		if (at_shared)
			return node;
		node = core_child(node, core);
	}
}
/* Descend along the branch between "kernel" and "thread" that is
 * selected by the domain elements in "core" and stop at the "thread" mark.
 * NULL is returned if an error is encountered along the way.
 */
__isl_give isl_schedule_node *gpu_tree_move_down_to_thread(
	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
{
	for (;;) {
		int at_thread = node_is_thread(node);

		if (at_thread < 0)
			return isl_schedule_node_free(node);
		if (at_thread)
			return node;
		node = core_child(node, core);
	}
}
/* Starting from a position underneath the "thread" mark,
 * climb from parent to parent until that mark is reached.
 * NULL is returned if an error is encountered along the way.
 */
__isl_give isl_schedule_node *gpu_tree_move_up_to_thread(
	__isl_take isl_schedule_node *node)
{
	for (;;) {
		int at_thread = node_is_thread(node);

		if (at_thread < 0)
			return isl_schedule_node_free(node);
		if (at_thread)
			return node;
		node = isl_schedule_node_parent(node);
	}
}
/* Starting from a position underneath the "kernel" mark,
 * climb from parent to parent until that mark is reached.
 * NULL is returned if an error is encountered along the way.
 */
__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel(
	__isl_take isl_schedule_node *node)
{
	for (;;) {
		int at_kernel = gpu_tree_node_is_kernel(node);

		if (at_kernel < 0)
			return isl_schedule_node_free(node);
		if (at_kernel)
			return node;
		node = isl_schedule_node_parent(node);
	}
}
/* Descend from "node" (the "kernel" mark, or at least a node whose
 * schedule depth does not exceed "depth") to a band node at schedule
 * depth "depth", following the branch selected by the elements in "core".
 * The "thread" mark is assumed to lie at schedule depth "depth" or deeper.
 *
 * A band node that straddles "depth" is split in two such that
 * the second part starts exactly at schedule depth "depth".
 * Once the requested depth has been reached, the descent continues
 * until a "shared" mark, a "thread" mark or a band node is found.
 */
__isl_give isl_schedule_node *gpu_tree_move_down_to_depth(
	__isl_take isl_schedule_node *node, int depth,
	__isl_keep isl_union_set *core)
{
	/* Phase 1: go down until the schedule depth is at least "depth". */
	while (node) {
		int cur = isl_schedule_node_get_schedule_depth(node);

		if (cur >= depth)
			break;
		if (isl_schedule_node_get_type(node) ==
		    isl_schedule_node_band) {
			int width = isl_schedule_node_band_n_member(node);

			if (cur + width > depth)
				node = isl_schedule_node_band_split(node,
								depth - cur);
		}
		node = core_child(node, core);
	}

	/* Phase 2: skip ahead to the first mark or band node. */
	for (;;) {
		int at_shared, at_thread;

		at_shared = node_is_shared(node);
		if (at_shared < 0)
			return isl_schedule_node_free(node);
		if (at_shared)
			return node;

		at_thread = node_is_thread(node);
		if (at_thread < 0)
			return isl_schedule_node_free(node);
		if (at_thread)
			return node;

		if (isl_schedule_node_get_type(node) == isl_schedule_node_band)
			return node;

		node = core_child(node, core);
	}
}
/* Build a union set that holds a single zero-dimensional universe set
 * whose tuple identifier is named "syncX" and has "kernel" as user pointer.
 * X is taken from kernel->n_sync, which is incremented as a side effect.
 */
static __isl_give isl_union_set *create_sync_domain(struct ppcg_kernel *kernel)
{
	char label[40];
	isl_id *sync_id;
	isl_space *sync_space;
	isl_set *universe;

	snprintf(label, sizeof(label), "sync%d", kernel->n_sync++);
	sync_id = isl_id_alloc(kernel->ctx, label, kernel);

	sync_space = isl_space_set_alloc(kernel->ctx, 0, 0);
	sync_space = isl_space_set_tuple_id(sync_space, isl_dim_set, sync_id);

	universe = isl_set_universe(sync_space);
	return isl_union_set_from_set(universe);
}
/* Does "id" identify a synchronization statement inside "kernel"?
 * This requires the name of "id" to begin with "sync" and
 * the user pointer of "id" to be equal to "kernel".
 * An identifier without a name is never a synchronization identifier.
 */
int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel)
{
	const char *label = isl_id_get_name(id);

	if (!label || strncmp(label, "sync", 4) != 0)
		return 0;
	return isl_id_get_user(id) == kernel;
}
/* Is "domain" made up of exactly one set and does the tuple identifier
 * of that set mark it as a synchronization statement of "kernel"?
 */
static int domain_is_sync(__isl_keep isl_union_set *domain,
	struct ppcg_kernel *kernel)
{
	isl_set *only;
	isl_id *tuple_id;
	int result;

	if (isl_union_set_n_set(domain) != 1)
		return 0;

	only = isl_set_from_union_set(isl_union_set_copy(domain));
	tuple_id = isl_set_get_tuple_id(only);
	result = gpu_tree_id_is_sync(tuple_id, kernel);
	isl_id_free(tuple_id);
	isl_set_free(only);

	return result;
}
/* Is "node" a filter node that selects a synchronization statement
 * of "kernel"?
 *
 * Return -1 if "node" is NULL and 0 if "node" is not a filter node.
 * Otherwise, the result is that of domain_is_sync on the filter.
 */
static int node_is_sync_filter(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_union_set *filter;
	int result;

	if (!node)
		return -1;
	if (isl_schedule_node_get_type(node) != isl_schedule_node_filter)
		return 0;

	filter = isl_schedule_node_filter_get_filter(node);
	result = domain_is_sync(filter, kernel);
	isl_union_set_free(filter);

	return result;
}
/* Does the sequence that "node" is part of contain an earlier
 * synchronization statement for "kernel"?
 * In other words, is the parent of "node" a filter that has
 * some previous sibling filter selecting exactly such a statement?
 *
 * Return 1 if so, 0 if not and -1 on error.
 * "node" itself is left untouched; the search is performed on a copy.
 */
static int has_preceding_sync(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_schedule_node *cursor;
	int result = 0;

	cursor = isl_schedule_node_parent(isl_schedule_node_copy(node));
	while (result == 0) {
		if (!isl_schedule_node_has_previous_sibling(cursor))
			break;
		cursor = isl_schedule_node_previous_sibling(cursor);
		if (!cursor)
			break;
		result = node_is_sync_filter(cursor, kernel);
	}
	/* Losing the cursor means that one of the moves failed. */
	if (!cursor)
		result = -1;
	isl_schedule_node_free(cursor);

	return result;
}
/* Does the sequence that "node" is part of contain a later
 * synchronization statement for "kernel"?
 * In other words, is the parent of "node" a filter that has
 * some subsequent sibling filter selecting exactly such a statement?
 *
 * Return 1 if so, 0 if not and -1 on error.
 * "node" itself is left untouched; the search is performed on a copy.
 */
static int has_following_sync(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_schedule_node *cursor;
	int result = 0;

	cursor = isl_schedule_node_parent(isl_schedule_node_copy(node));
	while (result == 0) {
		if (!isl_schedule_node_has_next_sibling(cursor))
			break;
		cursor = isl_schedule_node_next_sibling(cursor);
		if (!cursor)
			break;
		result = node_is_sync_filter(cursor, kernel);
	}
	/* Losing the cursor means that one of the moves failed. */
	if (!cursor)
		result = -1;
	isl_schedule_node_free(cursor);

	return result;
}
/* Is there a synchronization statement for "kernel" inside the subtree
 * rooted at "node" (which is a band node) that comes before
 * the core computation of "kernel" (the elements of kernel->core)?
 *
 * The branch towards the "thread" mark is followed one step at a time
 * and at each step the preceding siblings are inspected.
 * Return 1 if such a statement is found, 0 if not and -1 on error.
 */
static int has_sync_before_core(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_schedule_node *cursor;
	int at_thread;
	int result = 0;

	cursor = isl_schedule_node_copy(node);
	for (;;) {
		at_thread = node_is_thread(cursor);
		if (at_thread != 0)
			break;
		cursor = core_child(cursor, kernel->core);
		result = has_preceding_sync(cursor, kernel);
		if (result != 0)
			break;
	}
	if (at_thread < 0 || !cursor)
		result = -1;
	isl_schedule_node_free(cursor);

	return result;
}
/* Is there a synchronization statement for "kernel" inside the subtree
 * rooted at "node" (which is a band node) that comes after
 * the core computation of "kernel" (the elements of kernel->core)?
 *
 * The branch towards the "thread" mark is followed one step at a time
 * and at each step the following siblings are inspected.
 * Return 1 if such a statement is found, 0 if not and -1 on error.
 */
static int has_sync_after_core(__isl_keep isl_schedule_node *node,
	struct ppcg_kernel *kernel)
{
	isl_schedule_node *cursor;
	int at_thread;
	int result = 0;

	cursor = isl_schedule_node_copy(node);
	for (;;) {
		at_thread = node_is_thread(cursor);
		if (at_thread != 0)
			break;
		cursor = core_child(cursor, kernel->core);
		result = has_following_sync(cursor, kernel);
		if (result != 0)
			break;
	}
	if (at_thread < 0 || !cursor)
		result = -1;
	isl_schedule_node_free(cursor);

	return result;
}
/* Graft a freshly created synchronization statement for "kernel"
 * in front of "node", inserting (or extending) an extension
 * on top of "node" in the process.
 * The result points to the original node in the updated schedule tree.
 */
static __isl_give isl_schedule_node *insert_sync_before(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	isl_schedule_node *sync_tree;

	if (!node)
		return NULL;

	sync_tree = isl_schedule_node_from_domain(create_sync_domain(kernel));
	return isl_schedule_node_graft_before(node, sync_tree);
}
/* Graft a freshly created synchronization statement for "kernel"
 * after "node", inserting (or extending) an extension
 * on top of "node" in the process.
 * The result points to the original node in the updated schedule tree.
 */
static __isl_give isl_schedule_node *insert_sync_after(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	isl_schedule_node *sync_tree;

	if (!node)
		return NULL;

	sync_tree = isl_schedule_node_from_domain(create_sync_domain(kernel));
	return isl_schedule_node_graft_after(node, sync_tree);
}
/* Make sure "node" is preceded by a synchronization node for "kernel".
 * If the enclosing sequence does not already contain one before "node",
 * then an extension that adds one is inserted on top of "node".
 */
__isl_give isl_schedule_node *gpu_tree_ensure_preceding_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int present = has_preceding_sync(node, kernel);

	if (present < 0)
		return isl_schedule_node_free(node);
	return present ? node : insert_sync_before(node, kernel);
}
/* Make sure "node" is followed by a synchronization node for "kernel".
 * If the enclosing sequence does not already contain one after "node",
 * then an extension that adds one is inserted on top of "node".
 */
__isl_give isl_schedule_node *gpu_tree_ensure_following_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int present = has_following_sync(node, kernel);

	if (present < 0)
		return isl_schedule_node_free(node);
	return present ? node : insert_sync_after(node, kernel);
}
/* Make sure the core computation of "kernel" is followed
 * by a synchronization node.
 * Nothing is changed if "node" itself already contains a synchronization
 * node following the core computation or if "node" already has
 * a following synchronization node in its sequence.
 * Otherwise, an extension that puts a synchronization node
 * after "node" is inserted on top of "node".
 */
__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int present;

	present = has_sync_after_core(node, kernel);
	/* Only look at the siblings if nothing was found inside "node". */
	if (present == 0)
		present = has_following_sync(node, kernel);

	if (present < 0)
		return isl_schedule_node_free(node);
	if (present)
		return node;
	return insert_sync_after(node, kernel);
}
/* Move to a synchronization node for "kernel" that lies to the left
 * of "node" in the sequence on top of "node".
 * If "node" itself already contains a synchronization node that precedes
 * the core computation of "kernel", then "node" is returned unchanged.
 * Otherwise, a preceding synchronization node is created first
 * if there is none yet and the result points to the child
 * of the filter selecting the synchronization statement.
 */
__isl_give isl_schedule_node *gpu_tree_move_left_to_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int inside;
	int at_sync;

	inside = has_sync_before_core(node, kernel);
	if (inside < 0)
		return isl_schedule_node_free(node);
	if (inside)
		return node;

	node = gpu_tree_ensure_preceding_sync(node, kernel);

	/* Scan the filters to the left, starting from the filter above "node". */
	node = isl_schedule_node_parent(node);
	for (at_sync = node_is_sync_filter(node, kernel); at_sync == 0;
	     at_sync = node_is_sync_filter(node, kernel))
		node = isl_schedule_node_previous_sibling(node);
	if (at_sync < 0)
		node = isl_schedule_node_free(node);

	return isl_schedule_node_child(node, 0);
}
/* Move to a synchronization node for "kernel" that lies to the right
 * of "node" in the sequence on top of "node".
 * If "node" itself already contains a synchronization node that follows
 * the core computation of "kernel", then "node" is returned unchanged.
 * Otherwise, a following synchronization node is created first
 * if there is none yet and the result points to the child
 * of the filter selecting the synchronization statement.
 */
__isl_give isl_schedule_node *gpu_tree_move_right_to_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
{
	int inside;
	int at_sync;

	inside = has_sync_after_core(node, kernel);
	if (inside < 0)
		return isl_schedule_node_free(node);
	if (inside)
		return node;

	node = gpu_tree_ensure_following_sync(node, kernel);

	/* Scan the filters to the right, starting from the filter above "node". */
	node = isl_schedule_node_parent(node);
	for (at_sync = node_is_sync_filter(node, kernel); at_sync == 0;
	     at_sync = node_is_sync_filter(node, kernel))
		node = isl_schedule_node_next_sibling(node);
	if (at_sync < 0)
		node = isl_schedule_node_free(node);

	return isl_schedule_node_child(node, 0);
}

View File

@ -1,33 +0,0 @@
#ifndef GPU_TREE_H
#define GPU_TREE_H

#include <isl/schedule_node.h>

#include "gpu.h"

/* Helpers for navigating between the "kernel", "shared" and "thread" marks
 * of a schedule tree and for placing synchronization statements.
 * Functions taking a "core" argument use it to pick the branch
 * that contains the "thread" mark whenever a sequence is encountered.
 * Functions returning an isl_schedule_node return NULL on error.
 */

/* Insert a "shared" mark at the "thread" mark below "node",
 * unless one is already present between "node" and the "thread" mark.
 * The result points to the same tree depth as the input.
 */
__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
	__isl_take isl_schedule_node *node);

/* Does "node" carry the "kernel" mark?
 * Callers treat a negative return value as an error.
 */
int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node);

/* Move down to the "shared" mark. */
__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
/* Move up to the "thread" mark from a position underneath it. */
__isl_give isl_schedule_node *gpu_tree_move_up_to_thread(
	__isl_take isl_schedule_node *node);
/* Move down to the "thread" mark. */
__isl_give isl_schedule_node *gpu_tree_move_down_to_thread(
	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
/* Move up to the "kernel" mark from a position underneath it. */
__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel(
	__isl_take isl_schedule_node *node);
/* Move down to a band node at schedule depth "depth",
 * splitting a band node if "depth" falls in the middle of it.
 */
__isl_give isl_schedule_node *gpu_tree_move_down_to_depth(
	__isl_take isl_schedule_node *node, int depth,
	__isl_keep isl_union_set *core);

/* Does the name of "id" start with "sync" and
 * does its user pointer equal "kernel"?
 */
int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel);

/* Ensure a synchronization node follows the core computation of "kernel",
 * either inside "node" or after "node".
 */
__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
/* Ensure a synchronization node for "kernel" precedes "node".
 * This function is defined with external linkage next to
 * gpu_tree_ensure_following_sync and is declared here so that
 * the definition is checked against a prototype.
 */
__isl_give isl_schedule_node *gpu_tree_ensure_preceding_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
/* Ensure a synchronization node for "kernel" follows "node". */
__isl_give isl_schedule_node *gpu_tree_ensure_following_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);

/* Move to a synchronization node for "kernel" to the left of "node",
 * creating one if needed, unless "node" itself already contains one
 * before the core computation.
 */
__isl_give isl_schedule_node *gpu_tree_move_left_to_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
/* Move to a synchronization node for "kernel" to the right of "node",
 * creating one if needed, unless "node" itself already contains one
 * after the core computation.
 */
__isl_give isl_schedule_node *gpu_tree_move_right_to_sync(
	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);

#endif

View File

@ -1,684 +0,0 @@
/*
* Copyright 2016 Sven Verdoolaege
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege.
*/
#include <isl/ctx.h>
#include <isl/id.h>
#include <isl/val.h>
#include <isl/space.h>
#include <isl/aff.h>
#include <isl/set.h>
#include <isl/map.h>
#include <isl/union_set.h>
#include <isl/union_map.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include "ppcg.h"
/* Internal data structure for use during the detection of statements
 * that can be grouped.
 *
 * "sc" contains the original schedule constraints (not a copy).
 * "dep" contains the intersection of the validity and the proximity
 * constraints in "sc". It may be NULL if it has not been computed yet.
 * "group_id" is the identifier for the next group that is extracted.
 *
 * "domain" is the set of statement instances that belong to any of the groups.
 * "contraction" maps the elements of "domain" to the corresponding group
 * instances.
 * "schedule" schedules the statements in each group relatively to each other.
 * These last three fields are NULL if no groups have been found so far.
 *
 * Note that "sc" has to remain the first field since the structure
 * is set up using a positional initializer of the form { sc }.
 */
struct ppcg_grouping {
/* Borrowed schedule constraints; not released by ppcg_grouping_clear. */
isl_schedule_constraints *sc;
/* Lazily computed by ppcg_grouping_compute_dep. */
isl_union_map *dep;
/* Counter used to derive the name of the next group. */
int group_id;
/* Statement instances covered by the groups found so far. */
isl_union_set *domain;
/* Mapping from statement instances to group instances. */
isl_union_pw_multi_aff *contraction;
/* Relative schedule of the statements inside each group. */
isl_schedule *schedule;
};
/* Release the objects owned by "grouping".
 * The schedule constraints in grouping->sc are not released here and
 * "grouping" itself is not freed either.
 */
static void ppcg_grouping_clear(struct ppcg_grouping *grouping)
{
	isl_schedule_free(grouping->schedule);
	isl_union_pw_multi_aff_free(grouping->contraction);
	isl_union_set_free(grouping->domain);
	isl_union_map_free(grouping->dep);
}
/* Make sure grouping->dep holds the intersection of the validity and
 * the proximity dependences of grouping->sc.
 * The intersection is only computed if it is not available yet.
 * Return isl_stat_error if the intersection could not be computed.
 */
static isl_stat ppcg_grouping_compute_dep(struct ppcg_grouping *grouping)
{
	if (!grouping->dep) {
		isl_union_map *valid, *prox;

		valid = isl_schedule_constraints_get_validity(grouping->sc);
		prox = isl_schedule_constraints_get_proximity(grouping->sc);
		grouping->dep = isl_union_map_intersect(valid, prox);
	}

	return grouping->dep ? isl_stat_ok : isl_stat_error;
}
/* Information extracted from one or more consecutive leaves
 * in the input schedule.
 *
 * "list" contains the sets of statement instances in the leaves,
 * one element in the list for each original leaf.
 * "domain" contains the union of the sets in "list".
 * "prefix" contains the prefix schedule of these elements.
 *
 * An element initially describes a single leaf; merge_pair combines
 * two adjacent elements into one by taking the union of the domains,
 * concatenating the lists and adding up the prefixes.
 */
struct ppcg_grouping_leaf {
/* Union of all statement instances in the (merged) leaves. */
isl_union_set *domain;
/* Per-leaf instance sets, in the order of the original leaves. */
isl_union_set_list *list;
/* Prefix schedule defined on the elements of "domain". */
isl_multi_union_pw_aff *prefix;
};
/* Free all memory allocated for "leaves".
*/
static void ppcg_grouping_leaf_free(int n, struct ppcg_grouping_leaf leaves[])
{
int i;
if (!leaves)
return;
for (i = 0; i < n; ++i) {
isl_union_set_free(leaves[i].domain);
isl_union_set_list_free(leaves[i].list);
isl_multi_union_pw_aff_free(leaves[i].prefix);
}
free(leaves);
}
/* Convenience wrapper that returns the prefix schedule at "node"
 * as an isl_multi_union_pw_aff.
 */
static __isl_give isl_multi_union_pw_aff *get_prefix(
	__isl_keep isl_schedule_node *node)
{
	isl_multi_union_pw_aff *prefix;

	prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
	return prefix;
}
/* Collect information about the "n" children of "node" starting
 * at position "first", all of which are known to be filtered leaves.
 *
 * The result is a newly allocated array of "n" elements, each describing
 * a single leaf, or NULL if "node" is NULL or the allocation fails.
 */
struct ppcg_grouping_leaf *extract_leaves(__isl_keep isl_schedule_node *node,
	int first, int n)
{
	int pos;
	struct ppcg_grouping_leaf *result;

	if (!node)
		return NULL;

	result = isl_calloc_array(isl_schedule_node_get_ctx(node),
				struct ppcg_grouping_leaf, n);
	if (!result)
		return NULL;

	for (pos = 0; pos < n; ++pos) {
		struct ppcg_grouping_leaf *leaf = &result[pos];
		isl_schedule_node *below;
		isl_union_set *instances;

		/* Step through the filter to reach the leaf itself. */
		below = isl_schedule_node_get_child(node, first + pos);
		below = isl_schedule_node_child(below, 0);

		instances = isl_schedule_node_get_domain(below);
		leaf->domain = isl_union_set_copy(instances);
		leaf->list = isl_union_set_list_from_union_set(instances);
		leaf->prefix = get_prefix(below);

		isl_schedule_node_free(below);
	}

	return result;
}
/* Internal data structure used by merge_leaves.
 *
 * "src" and "dst" point to the two consecutive leaves that are
 * under investigation for being merged.
 * "merge" is initially set to 0 and is set to 1 as soon as
 * it turns out that it is useful to merge the two leaves.
 *
 * The structure is passed as the "user" argument of check_merge,
 * which sets "merge" and aborts the iteration on the first suitable map.
 */
struct ppcg_merge_leaves_data {
/* Output flag: set to 1 by check_merge when the leaves should be merged. */
int merge;
/* The first of the two consecutive leaves. */
struct ppcg_grouping_leaf *src;
/* The second of the two consecutive leaves. */
struct ppcg_grouping_leaf *dst;
};
/* Given a relation "map" between instances of two statements A and B,
 * is every instance of A in the domain of "src" related to something and
 * is every instance of B in the domain of "dst" reached by something?
 * That is, does the domain of "map" cover the A-instances of "src" and
 * does the range of "map" cover the B-instances of "dst"?
 */
static isl_bool covers_src_and_dst(__isl_keep isl_map *map,
	struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
{
	isl_space *side;
	isl_set *wanted, *reached;
	isl_bool covered;

	/* Source side: instances of A versus the domain of "map". */
	side = isl_space_domain(isl_map_get_space(map));
	wanted = isl_union_set_extract_set(src->domain, side);
	reached = isl_map_domain(isl_map_copy(map));
	covered = isl_set_is_subset(wanted, reached);
	isl_set_free(wanted);
	isl_set_free(reached);
	if (covered != isl_bool_true)
		return covered;

	/* Destination side: instances of B versus the range of "map". */
	side = isl_space_range(isl_map_get_space(map));
	wanted = isl_union_set_extract_set(dst->domain, side);
	reached = isl_map_range(isl_map_copy(map));
	covered = isl_set_is_subset(wanted, reached);
	isl_set_free(wanted);
	isl_set_free(reached);

	return covered;
}
/* Given a relation "map" between instances of two statements A and B,
 * are all pairs of related instances executed together
 * in the input schedule, i.e., do the prefix schedules of "src" and "dst"
 * assign the same value to both elements of each pair?
 *
 * The check is performed by restricting "map" to those pairs
 * that have equal prefix schedule values and testing whether
 * "map" is a subset of this restriction.
 */
static isl_bool matches_prefix(__isl_keep isl_map *map,
	struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
{
	isl_multi_union_pw_aff *joint;
	isl_union_map *pairs, *same_time;
	isl_bool contained;

	/* A single prefix schedule defined on both sets of instances. */
	joint = isl_multi_union_pw_aff_union_add(
			isl_multi_union_pw_aff_copy(src->prefix),
			isl_multi_union_pw_aff_copy(dst->prefix));

	pairs = isl_union_map_from_map(isl_map_copy(map));
	same_time = isl_union_map_eq_at_multi_union_pw_aff(
			isl_union_map_copy(pairs), joint);
	contained = isl_union_map_is_subset(pairs, same_time);

	isl_union_map_free(pairs);
	isl_union_map_free(same_time);

	return contained;
}
/* Given a set of validity and proximity schedule constraints "map"
 * between statements in consecutive leaves in a valid schedule,
 * decide whether the two leaves should be merged into one.
 *
 * The leaves are merged if "map" covers every instance of
 * the first statement and every instance of the second statement,
 * if "map" is bijective and if each pair of related instances
 * is assigned the same value by the prefix schedules, meaning
 * that the two instances are executed consecutively in the input schedule.
 *
 * In that case, each instance of the first statement has exactly one
 * instance of the second statement that is executed right after it,
 * that depends on it and that should be kept close to it.
 * Executing the two together is then both possible (according to
 * the input schedule) and desirable (according to the validity and
 * proximity schedule constraints).
 *
 * This function is used as an isl_union_map_foreach_map callback.
 * If the leaves should be merged, then data->merge is set and
 * isl_stat_error is returned to abort the iteration, so the caller
 * needs to inspect data->merge to tell this apart from a real error.
 */
static isl_stat check_merge(__isl_take isl_map *map, void *user)
{
	struct ppcg_merge_leaves_data *data = user;
	isl_bool verdict;

	verdict = covers_src_and_dst(map, data->src, data->dst);
	if (verdict > 0)
		verdict = isl_map_is_bijective(map);
	if (verdict > 0)
		verdict = matches_prefix(map, data->src, data->dst);
	isl_map_free(map);

	if (verdict < 0)
		return isl_stat_error;
	if (verdict) {
		data->merge = 1;
		return isl_stat_error;
	}
	return isl_stat_ok;
}
/* Merge the leaves at position "pos" and "pos + 1" in "leaves".
*/
static isl_stat merge_pair(int n, struct ppcg_grouping_leaf leaves[], int pos)
{
int i;
leaves[pos].domain = isl_union_set_union(leaves[pos].domain,
leaves[pos + 1].domain);
leaves[pos].list = isl_union_set_list_concat(leaves[pos].list,
leaves[pos + 1].list);
leaves[pos].prefix = isl_multi_union_pw_aff_union_add(
leaves[pos].prefix, leaves[pos + 1].prefix);
for (i = pos + 1; i + 1 < n; ++i)
leaves[i] = leaves[i + 1];
leaves[n - 1].domain = NULL;
leaves[n - 1].list = NULL;
leaves[n - 1].prefix = NULL;
if (!leaves[pos].domain || !leaves[pos].list || !leaves[pos].prefix)
return isl_stat_error;
return isl_stat_ok;
}
/* Merge pairs of consecutive leaves in "leaves" taking into account
 * the intersection of validity and proximity schedule constraints "dep".
 *
 * If a leaf has been merged with the next leaf, then the combination
 * is checked again for merging with the next leaf.
 * That is, if the leaves are A, B and C, then B may not have been
 * merged with C, but after merging A and B, it could still be useful
 * to merge the combination AB with C.
 *
 * Two leaves A and B are merged if there are instances of at least
 * one pair of statements, one statement in A and one in B, such that
 * the validity and proximity schedule constraints between them
 * make them suitable for merging according to check_merge.
 *
 * Return the final number of leaves in the sequence, or -1 on error.
 */
static int merge_leaves(int n, struct ppcg_grouping_leaf leaves[],
__isl_keep isl_union_map *dep)
{
int i;
struct ppcg_merge_leaves_data data;
/* Consider the leaves from the last one to the first one. */
for (i = n - 1; i >= 0; --i) {
isl_union_map *dep_i;
isl_stat ok;
/* The last leaf has no successor to be merged with. */
if (i + 1 >= n)
continue;
/* Restrict "dep" to dependences from leaf i to leaf i + 1. */
dep_i = isl_union_map_copy(dep);
dep_i = isl_union_map_intersect_domain(dep_i,
isl_union_set_copy(leaves[i].domain));
dep_i = isl_union_map_intersect_range(dep_i,
isl_union_set_copy(leaves[i + 1].domain));
data.merge = 0;
data.src = &leaves[i];
data.dst = &leaves[i + 1];
ok = isl_union_map_foreach_map(dep_i, &check_merge, &data);
isl_union_map_free(dep_i);
/* check_merge also returns isl_stat_error to stop the iteration
 * once it has set data.merge, so a negative result is only
 * a real error if data.merge has not been set.
 */
if (ok < 0 && !data.merge)
return -1;
if (!data.merge)
continue;
if (merge_pair(n, leaves, i) < 0)
return -1;
/* One leaf fewer; undo the upcoming --i so that the merged leaf
 * at position i is checked again against its new successor.
 */
--n;
++i;
}
return n;
}
/* Build a schedule on "domain" that consists of a sequence node
 * executing the elements of "list" one after the other.
 */
static __isl_give isl_schedule *schedule_from_domain_and_list(
	__isl_keep isl_union_set *domain, __isl_keep isl_union_set_list *list)
{
	isl_schedule *tmp, *result;
	isl_schedule_node *pos;

	/* Start from a schedule on "domain" and grab its root node. */
	tmp = isl_schedule_from_domain(isl_union_set_copy(domain));
	pos = isl_schedule_get_root(tmp);
	isl_schedule_free(tmp);

	/* Put the sequence directly underneath the root. */
	pos = isl_schedule_node_child(pos, 0);
	pos = isl_schedule_node_insert_sequence(pos,
					isl_union_set_list_copy(list));

	result = isl_schedule_node_get_schedule(pos);
	isl_schedule_node_free(pos);

	return result;
}
/* Construct a unique identifier for a new group in "grouping".
 *
 * Names of the form G_n are tried for increasing n, starting
 * at grouping->group_id, until one is found for which the domain
 * of the original schedule constraints has no elements in
 * the correspondingly named space.
 * grouping->group_id is advanced past every value that is tried.
 * "space" is only used for checking the candidate names and
 * is released before returning.
 */
static isl_id *construct_group_id(struct ppcg_grouping *grouping,
	__isl_take isl_space *space)
{
	isl_ctx *ctx;
	isl_id *result;
	isl_bool unused;
	isl_union_set *all;

	if (!space)
		return NULL;

	ctx = isl_space_get_ctx(space);
	all = isl_schedule_constraints_get_domain(grouping->sc);
	for (;;) {
		char name[20];
		isl_id *candidate;
		isl_set *clash;

		snprintf(name, sizeof(name), "G_%d", grouping->group_id++);
		candidate = isl_id_alloc(ctx, name, NULL);
		space = isl_space_set_tuple_id(space, isl_dim_set, candidate);
		clash = isl_union_set_extract_set(all, isl_space_copy(space));
		unused = isl_set_plain_is_empty(clash);
		isl_set_free(clash);
		/* Stop on error or as soon as the name is not in use. */
		if (unused != isl_bool_false)
			break;
	}
	if (unused < 0)
		space = isl_space_free(space);

	result = isl_space_get_tuple_id(space, isl_dim_set);
	isl_space_free(space);
	isl_union_set_free(all);

	return result;
}
/* Construct the contraction for a new group in "grouping"
 * from "prefix" and "domain".
 *
 * The instances of the new group are the values of the prefix
 * schedule "prefix". The group gets an identifier that clashes
 * neither with earlier groups nor with the statements in the domain
 * of the original schedule constraints.
 * In general, it is then sufficient to name the output of "prefix" and
 * to convert the isl_multi_union_pw_aff to an isl_union_pw_multi_aff.
 * This conversion is not possible for a zero-dimensional "prefix",
 * so in that case the contraction maps all of "domain" to the single
 * zero-dimensional group instance instead.
 */
static isl_union_pw_multi_aff *group_contraction_from_prefix_and_domain(
	struct ppcg_grouping *grouping,
	__isl_keep isl_multi_union_pw_aff *prefix,
	__isl_keep isl_union_set *domain)
{
	isl_space *group_space;
	isl_id *group_id;
	isl_multi_val *origin;
	int n_dim;

	group_space = isl_multi_union_pw_aff_get_space(prefix);
	if (!group_space)
		return NULL;

	/* The dimension needs to be read before the space is given away. */
	n_dim = isl_space_dim(group_space, isl_dim_set);
	group_id = construct_group_id(grouping, group_space);

	if (n_dim != 0) {
		prefix = isl_multi_union_pw_aff_copy(prefix);
		prefix = isl_multi_union_pw_aff_set_tuple_id(prefix,
						isl_dim_out, group_id);
		return isl_union_pw_multi_aff_from_multi_union_pw_aff(prefix);
	}

	group_space = isl_multi_union_pw_aff_get_space(prefix);
	group_space = isl_space_set_tuple_id(group_space, isl_dim_set,
						group_id);
	origin = isl_multi_val_zero(group_space);
	return isl_union_pw_multi_aff_multi_val_on_domain(
			isl_union_set_copy(domain), origin);
}
/* Extend "grouping" with groups corresponding to merged
 * leaves in the list of potentially merged leaves "leaves".
 *
 * The "list" field of each element in "leaves" contains a list
 * of the instance sets of the original leaves that have been
 * merged into this element. If at least two of the original leaves
 * have been merged into a given element, then add the corresponding
 * group to "grouping".
 * In particular, the domain is extended with the statement instances
 * of the merged leaves, the contraction is extended with a mapping
 * of these statement instances to instances of a new group and
 * the schedule is extended with a schedule that executes
 * the statement instances according to the order of the leaves
 * in which they appear.
 * Since the instances of the groups should already be scheduled apart
 * in the schedule into which this schedule will be plugged in,
 * the schedules of the individual groups are combined independently
 * of each other (as a set).
 *
 * The elements of "leaves" are only read; they remain owned by the caller.
 */
static isl_stat add_groups(struct ppcg_grouping *grouping,
int n, struct ppcg_grouping_leaf leaves[])
{
int i;
for (i = 0; i < n; ++i) {
int n_leaf;
isl_schedule *schedule;
isl_union_set *domain;
isl_union_pw_multi_aff *upma;
n_leaf = isl_union_set_list_n_union_set(leaves[i].list);
if (n_leaf < 0)
return isl_stat_error;
/* An element that still describes a single leaf is not a group. */
if (n_leaf <= 1)
continue;
/* Build the pieces describing this group on its own. */
schedule = schedule_from_domain_and_list(leaves[i].domain,
leaves[i].list);
upma = group_contraction_from_prefix_and_domain(grouping,
leaves[i].prefix, leaves[i].domain);
domain = isl_union_set_copy(leaves[i].domain);
/* Combine them with the groups found before, if any.
 * The old fields of "grouping" are passed to the combining calls
 * and are overwritten with the results below.
 */
if (grouping->domain) {
domain = isl_union_set_union(domain, grouping->domain);
upma = isl_union_pw_multi_aff_union_add(upma,
grouping->contraction);
schedule = isl_schedule_set(schedule,
grouping->schedule);
}
grouping->domain = domain;
grouping->contraction = upma;
grouping->schedule = schedule;
if (!grouping->domain || !grouping->contraction ||
!grouping->schedule)
return isl_stat_error;
}
return isl_stat_ok;
}
/* Look for any pairs of consecutive leaves among the "n" children of "node"
 * starting at "first" that should be merged together.
 * Store the results in "grouping".
 *
 * First make sure the intersection of validity and proximity
 * schedule constraints is available and extract the required
 * information from the "n" leaves.
 * Then try and merge consecutive leaves based on the validity
 * and proximity constraints.
 * If any pairs were successfully merged, then add groups
 * corresponding to the merged leaves to "grouping".
 *
 * Return isl_stat_error if any of these steps fails, including
 * the merging step itself.
 * The extracted leaf information is released on every path.
 */
static isl_stat group_subsequence(__isl_keep isl_schedule_node *node,
	int first, int n, struct ppcg_grouping *grouping)
{
	int n_merge;
	isl_stat r = isl_stat_ok;
	struct ppcg_grouping_leaf *leaves;

	if (ppcg_grouping_compute_dep(grouping) < 0)
		return isl_stat_error;

	leaves = extract_leaves(node, first, n);
	if (!leaves)
		return isl_stat_error;

	n_merge = merge_leaves(n, leaves, grouping->dep);
	if (n_merge < 0)
		/* A failed merge must not be reported as success. */
		r = isl_stat_error;
	else if (n_merge < n)
		/* At least one pair was merged. */
		r = add_groups(grouping, n_merge, leaves);

	/* Elements cleared by merge_pair hold NULL pointers,
	 * so releasing all "n" original slots is safe.
	 */
	ppcg_grouping_leaf_free(n, leaves);

	return r;
}
/* If "node" is a sequence, then look for consecutive leaves among
 * its children that should be merged together and record the results
 * in "grouping", which is passed in through "user".
 *
 * The children are split into maximal runs of consecutive
 * (filtered) leaves and group_subsequence is called on each such run.
 *
 * isl_bool_true is returned on success (also for nodes that are
 * not sequences), while isl_bool_error is returned on failure.
 */
static isl_bool detect_groups(__isl_keep isl_schedule_node *node, void *user)
{
	struct ppcg_grouping *grouping = user;
	int pos, n_child;
	int run_start = -1;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
		return isl_bool_true;

	n_child = isl_schedule_node_n_children(node);
	if (n_child < 0)
		return isl_bool_error;

	for (pos = 0; pos < n_child; ++pos) {
		isl_schedule_node *below;
		enum isl_schedule_node_type type;

		/* Look at what lies underneath the filter at this position. */
		below = isl_schedule_node_get_child(node, pos);
		below = isl_schedule_node_child(below, 0);
		type = isl_schedule_node_get_type(below);
		isl_schedule_node_free(below);

		if (type == isl_schedule_node_leaf) {
			/* Open a new run unless one is in progress already. */
			if (run_start < 0)
				run_start = pos;
			continue;
		}
		if (run_start < 0)
			continue;
		/* A non-leaf terminates the current run. */
		if (group_subsequence(node, run_start, pos - run_start,
					grouping) < 0)
			return isl_bool_error;
		run_start = -1;
	}

	/* Handle a run that extends up to the last child. */
	if (run_start >= 0 &&
	    group_subsequence(node, run_start, n_child - run_start,
				grouping) < 0)
		return isl_bool_error;

	return isl_bool_true;
}
/* Complete "grouping" to cover all statement instances in the domain
 * of grouping->sc.
 *
 * In particular, grouping->domain is set to the full set of statement
 * instances; grouping->contraction is extended with an identity
 * contraction on the additional instances and grouping->schedule
 * is extended with an independent schedule on those additional instances.
 * In the extension of grouping->contraction, the additional instances
 * are split into those that belong to different statements and those
 * that belong to some of the same statements. The first group
 * is replaced by its universe in order to simplify the contraction extension.
 *
 * NOTE(review): the fields of "grouping" are used without a NULL check,
 * so this is presumably only called after at least one group
 * has been found; confirm against the caller.
 */
static void complete_grouping(struct ppcg_grouping *grouping)
{
isl_union_set *domain, *left, *overlap;
isl_union_pw_multi_aff *upma;
isl_schedule *schedule;
domain = isl_schedule_constraints_get_domain(grouping->sc);
/* "left" holds the instances that are not part of any group. */
left = isl_union_set_subtract(isl_union_set_copy(domain),
isl_union_set_copy(grouping->domain));
/* Schedule the remaining instances independently of the groups. */
schedule = isl_schedule_from_domain(isl_union_set_copy(left));
schedule = isl_schedule_set(schedule, grouping->schedule);
grouping->schedule = schedule;
/* The old grouping->domain is handed to isl_union_set_universe
 * before the field is overwritten with the full domain.
 */
overlap = isl_union_set_universe(grouping->domain);
grouping->domain = domain;
/* "overlap" holds the remaining instances of statements that also
 * have instances in some group; those are kept as they are.
 * The other remaining instances are replaced by their universe.
 */
overlap = isl_union_set_intersect(isl_union_set_copy(left), overlap);
left = isl_union_set_subtract(left, isl_union_set_copy(overlap));
left = isl_union_set_universe(left);
left = isl_union_set_union(left, overlap);
/* Extend the contraction with the identity on the remaining instances. */
upma = isl_union_set_identity_union_pw_multi_aff(left);
upma = isl_union_pw_multi_aff_union_add(upma, grouping->contraction);
grouping->contraction = upma;
}
/* Compute a schedule on the domain of "sc" that respects the schedule
 * constraints in "sc".
 *
 * If options->group_chains is not set, or if no groups are detected,
 * then the schedule is computed directly from "sc".
 *
 * Otherwise, "schedule" (a known correct schedule) is used to detect
 * statements that can be combined.  In particular, statements that are
 * executed consecutively in a sequence in this schedule and where
 * all instances of the second depend on the instance of the first
 * that is executed in the same iteration of outer band nodes
 * are grouped together into a single statement.
 * The schedule constraints are mapped to these groups of statements,
 * a schedule is computed on the groups and the result is expanded again
 * to refer to the original statements.
 *
 * Return NULL if the detection of the groups fails.
 */
__isl_give isl_schedule *ppcg_compute_schedule(
	__isl_take isl_schedule_constraints *sc,
	__isl_keep isl_schedule *schedule, struct ppcg_options *options)
{
	struct ppcg_grouping grouping = { sc };
	isl_union_map *to_groups;
	isl_schedule *grouped;

	if (!options->group_chains)
		return isl_schedule_constraints_compute_schedule(sc);

	grouping.group_id = 0;
	if (isl_schedule_foreach_schedule_node_top_down(schedule,
			&detect_groups, &grouping) < 0) {
		ppcg_grouping_clear(&grouping);
		isl_schedule_constraints_free(sc);
		return NULL;
	}

	/* Nothing was grouped: schedule the original statements. */
	if (!grouping.contraction) {
		ppcg_grouping_clear(&grouping);
		return isl_schedule_constraints_compute_schedule(sc);
	}

	complete_grouping(&grouping);

	/* Schedule the groups rather than the original statements. */
	to_groups = isl_union_map_from_union_pw_multi_aff(
			isl_union_pw_multi_aff_copy(grouping.contraction));
	sc = isl_schedule_constraints_apply(sc, to_groups);
	grouped = isl_schedule_constraints_compute_schedule(sc);

	/* Map the result back to the original statements. */
	grouped = isl_schedule_expand(grouped,
			isl_union_pw_multi_aff_copy(grouping.contraction),
			isl_schedule_copy(grouping.schedule));

	ppcg_grouping_clear(&grouping);
	return grouped;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,41 +0,0 @@
#ifndef HYBRID_H
#define HYBRID_H
/* Interface of the ppcg_ht_* and hybrid_tile_* functions.
 * NOTE(review): only the declarations are visible here; the summaries
 * below are derived from the names and signatures and should be
 * confirmed against the implementation.
 */
#include <isl/val.h>
#include <isl/schedule_node.h>
#include "ppcg.h"
/* Opaque types; this header only ever passes pointers to them. */
struct ppcg_ht_bounds;
typedef struct ppcg_ht_bounds ppcg_ht_bounds;
struct ppcg_ht_phase;
typedef struct ppcg_ht_phase ppcg_ht_phase;
/* Predicates on "node" (respectively its parent, presumably);
 * "node" is not consumed (__isl_keep).
 */
isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node);
isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node);
/* Construct a ppcg_ht_bounds object for "node" in "scop";
 * the caller owns the result (__isl_give).
 */
__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop,
__isl_keep isl_schedule_node *node);
/* Inspection helpers on a ppcg_ht_bounds object; "bounds" is not consumed. */
void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds);
isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds);
isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds,
__isl_keep isl_multi_val *sizes);
/* Consumes "bounds", "sizes" and "node" (__isl_take) and returns
 * a schedule node owned by the caller.
 */
__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling(
__isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes,
__isl_take isl_schedule_node *node, struct ppcg_options *options);
/* Free "bounds"; the __isl_null annotation indicates a NULL return. */
__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free(
__isl_take ppcg_ht_bounds *bounds);
/* The returned phase is annotated __isl_keep, i.e., it is presumably
 * not owned by the caller and must not be freed -- TODO confirm.
 */
__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark(
__isl_keep isl_schedule_node *node);
__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point(
__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node);
/* Takes a callback "fn" that receives a schedule node and "user" and
 * returns a schedule node; presumably called once for each phase.
 */
__isl_give isl_schedule_node *hybrid_tile_foreach_phase(
__isl_take isl_schedule_node *node,
__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
void *user), void *user);
__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks(
__isl_take isl_schedule_node *node);
#endif

View File

@ -1,174 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include "ocl_utilities.h"
/* Return the OpenCL error string for a given error number.
 *
 * The table below is indexed by the negated error code.
 * Codes that fall outside the table, as well as codes inside the table
 * for which no name is listed (the designated initializers leave
 * such entries NULL), are reported as "Unspecified Error",
 * so that the result can always be printed with "%s".
 */
const char *opencl_error_string(cl_int error)
{
	int errorCount;
	const char *name;
	static const char *errorString[] = {
		[CL_SUCCESS] = "CL_SUCCESS",
		[-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND",
		[-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE",
		[-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE",
		[-CL_MEM_OBJECT_ALLOCATION_FAILURE] =
			"CL_MEM_OBJECT_ALLOCATION_FAILURE",
		[-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES",
		[-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY",
		[-CL_PROFILING_INFO_NOT_AVAILABLE] =
			"CL_PROFILING_INFO_NOT_AVAILABLE",
		[-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP",
		[-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH",
		[-CL_IMAGE_FORMAT_NOT_SUPPORTED] =
			"CL_IMAGE_FORMAT_NOT_SUPPORTED",
		[-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE",
		[-CL_MAP_FAILURE] = "CL_MAP_FAILURE",
		[-CL_INVALID_VALUE] = "CL_INVALID_VALUE",
		[-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE",
		[-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM",
		[-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE",
		[-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT",
		[-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES",
		[-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE",
		[-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR",
		[-CL_INVALID_MEM_OBJECT] = "CL_INVALID_MEM_OBJECT",
		[-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] =
			"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
		[-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE",
		[-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER",
		[-CL_INVALID_BINARY] = "CL_INVALID_BINARY",
		[-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS",
		[-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM",
		[-CL_INVALID_PROGRAM_EXECUTABLE] =
			"CL_INVALID_PROGRAM_EXECUTABLE",
		[-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME",
		[-CL_INVALID_KERNEL_DEFINITION] =
			"CL_INVALID_KERNEL_DEFINITION",
		[-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL",
		[-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX",
		[-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE",
		[-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE",
		[-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS",
		[-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION",
		[-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE",
		[-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE",
		[-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET",
		[-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST",
		[-CL_INVALID_EVENT] = "CL_INVALID_EVENT",
		[-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION",
		[-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT",
		[-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE",
		[-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL",
		[-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE",
		[-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY"
	};

	errorCount = sizeof(errorString) / sizeof(errorString[0]);

	/* Check the range before negating, so that the negation
	 * is only ever applied to a value inside the table.
	 */
	if (error > 0 || error <= -errorCount)
		return "Unspecified Error";

	/* Entries without a designated initializer are NULL. */
	name = errorString[-error];
	return name ? name : "Unspecified Error";
}
/* Look up a device on the first available platform and return it.
 *
 * A GPU is tried first if "use_gpu" is set.  If no GPU is found or
 * if "use_gpu" is not set, then a CPU device is looked up instead.
 * Any failure is reported on stderr and terminates the program.
 */
cl_device_id opencl_create_device(int use_gpu)
{
	cl_platform_id platform;
	cl_device_id dev;
	int status;

	status = clGetPlatformIDs(1, &platform, NULL);
	if (status < 0) {
		fprintf(stderr, "Error %s while looking for a platform.\n",
			opencl_error_string(status));
		exit(1);
	}

	/* Not asking for a GPU is handled like not finding one. */
	status = use_gpu ?
		clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL) :
		CL_DEVICE_NOT_FOUND;
	if (status == CL_DEVICE_NOT_FOUND)
		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1,
					&dev, NULL);
	if (status < 0) {
		fprintf(stderr, "Error %s while looking for a device.\n",
			opencl_error_string(status));
		exit(1);
	}

	return dev;
}
/* Create an OpenCL program from a string and compile it.
 *
 * "program_source" holds "program_size" bytes of source code and
 * "opencl_options" is passed on to the OpenCL compiler.
 * "dev" is only used for retrieving the build log on failure.
 *
 * Return the compiled program.  If the program cannot be created or
 * built, then print a message (and the build log, if it can be
 * retrieved) to stderr and terminate the program.
 */
cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
	const char *program_source, size_t program_size,
	const char *opencl_options)
{
	cl_int err;
	cl_program program;
	char *program_log;
	size_t log_size;

	program = clCreateProgramWithSource(ctx, 1,
			&program_source, &program_size, &err);
	if (err < 0) {
		fprintf(stderr, "Could not create the program\n");
		exit(1);
	}

	err = clBuildProgram(program, 0, NULL, opencl_options, NULL, NULL);
	if (err < 0) {
		fprintf(stderr, "Could not build the program.\n");

		/* Do not trust "log_size" if the size query itself failed. */
		log_size = 0;
		if (clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
				0, NULL, &log_size) != CL_SUCCESS)
			log_size = 0;

		/* Zero-initialize the buffer so that it is a valid (empty)
		 * string even if the log cannot be retrieved.
		 */
		program_log = (char *) calloc(log_size + 1, 1);
		if (program_log) {
			clGetProgramBuildInfo(program, dev,
				CL_PROGRAM_BUILD_LOG,
				log_size + 1, program_log, NULL);
			program_log[log_size] = '\0';
			fprintf(stderr, "%s\n", program_log);
			free(program_log);
		}
		exit(1);
	}

	return program;
}
/* Create an OpenCL program from a source file and compile it.
 *
 * The entire contents of "filename" are read into memory and passed to
 * opencl_build_program_from_string together with "ctx", "dev" and
 * "opencl_options".
 *
 * Return the compiled program.  If the file cannot be opened, measured
 * or read, or if no memory is available for its contents,
 * then print a message to stderr and terminate the program.
 */
cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
	const char* filename, const char* opencl_options)
{
	cl_program program;
	FILE *program_file;
	char *program_source;
	size_t program_size, read;
	long file_size;

	program_file = fopen(filename, "r");
	if (program_file == NULL) {
		fprintf(stderr, "Could not find the source file.\n");
		exit(1);
	}

	/* ftell returns -1 on failure; check it before converting
	 * to size_t, where it would turn into a huge size.
	 */
	file_size = -1;
	if (fseek(program_file, 0, SEEK_END) == 0)
		file_size = ftell(program_file);
	if (file_size < 0) {
		fprintf(stderr, "Error while reading the kernel.\n");
		fclose(program_file);
		exit(1);
	}
	program_size = (size_t) file_size;
	rewind(program_file);

	program_source = (char *) malloc(program_size + 1);
	if (program_source == NULL) {
		fprintf(stderr,
			"Could not allocate memory for the kernel source.\n");
		fclose(program_file);
		exit(1);
	}
	program_source[program_size] = '\0';

	read = fread(program_source, sizeof(char), program_size, program_file);
	if (read != program_size) {
		fprintf(stderr, "Error while reading the kernel.\n");
		exit(1);
	}
	fclose(program_file);

	program = opencl_build_program_from_string(ctx, dev, program_source,
			program_size, opencl_options);
	free(program_source);

	return program;
}

View File

@ -1,32 +0,0 @@
#ifndef OCL_UTILITIES_H
#define OCL_UTILITIES_H
/* Helper functions for setting up OpenCL: error reporting,
 * device selection and program compilation.
 */
/* The OpenCL header lives in a different directory on Apple platforms. */
#if defined(__APPLE__)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
/* Return the OpenCL error string for a given error number.
 */
const char *opencl_error_string(cl_int error);
/* Find a GPU or a CPU associated with the first available platform.
 * If use_gpu is set, then this function first tries to look for a GPU
 * in the first available platform.
 * If this fails or if use_gpu is not set, then it tries to use the CPU.
 */
cl_device_id opencl_create_device(int use_gpu);
/* Create an OpenCL program from a string and compile it.
 * "program_size" is the size of "program_source" and
 * "opencl_options" are the options passed to the OpenCL compiler.
 */
cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
const char *program_source, size_t program_size,
const char *opencl_options);
/* Create an OpenCL program from a source file and compile it.
 * "opencl_options" are the options passed to the OpenCL compiler.
 */
cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
const char* filename, const char* opencl_options);
#endif

View File

@ -1,11 +0,0 @@
/* NOTE(review): identifiers that start with an underscore followed by
 * an uppercase letter, such as this include guard, are reserved in C.
 */
#ifndef _OPENCL_H
#define _OPENCL_H
#include <pet.h>
#include "ppcg_options.h"
#include "ppcg.h"
/* Entry point of the OpenCL code generation.
 * "input" and "output" are presumably the names of the input and
 * output files and the return value presumably signals
 * success or failure -- TODO confirm against the implementation.
 */
int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
const char *input, const char *output);
#endif

View File

@ -1,78 +0,0 @@
#!/bin/sh
# Test the OpenCL target of ppcg: generate OpenCL code for the test and
# example programs, compile it against ocl_utilities.c and run it.
# For the examples, the output is also compared against that of
# the original program.
#
# Usage: <script> [--keep]
#   --keep   store the results in opencl_test.$VERSION and do not
#            remove them afterwards
#
# The @...@ placeholders are presumably filled in when this template
# is instantiated by the build system.
keep=no
for option; do
case "$option" in
--keep)
keep=yes
;;
esac
done
EXEEXT=@EXEEXT@
VERSION=@GIT_HEAD_VERSION@
CC="@CC@"
CFLAGS="--std=gnu99"
srcdir="@srcdir@"
# Pick the output directory: a fixed name with --keep,
# a fresh temporary directory (below $TMPDIR or /tmp) otherwise.
if [ $keep = "yes" ]; then
OUTDIR="opencl_test.$VERSION"
mkdir "$OUTDIR" || exit 1
else
if test "x$TMPDIR" = "x"; then
TMPDIR=/tmp
fi
OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1
fi
# run_tests <subdir> [<ppcg options>]
# Run ppcg with the given extra options on every $srcdir/tests/*.c file,
# storing the results in ${OUTDIR}/<subdir>, and compile and run
# the generated code.  The script exits as soon as one step fails.
run_tests () {
subdir=$1
ppcg_options=$2
echo Test with PPCG options \'$ppcg_options\'
mkdir ${OUTDIR}/${subdir} || exit 1
for i in $srcdir/tests/*.c; do
echo $i
name=`basename $i`
name="${name%.c}"
out_c="${OUTDIR}/${subdir}/$name.ppcg.c"
out="${OUTDIR}/${subdir}/$name.ppcg$EXEEXT"
# $options is expanded unquoted below on purpose,
# so that it is split into separate arguments.
options="--target=opencl --opencl-no-use-gpu $ppcg_options"
# A test may come with a file of OpenCL functions to include.
functions="$srcdir/tests/${name}_opencl_functions.cl"
if test -f $functions; then
options="$options --opencl-include-file=$functions"
options="$options --opencl-compiler-options=-I."
fi
./ppcg$EXEEXT $options $i -o "$out_c" || exit
$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
-I. "$out_c" -o "$out" || exit
$out || exit
done
}
run_tests default
run_tests embed --opencl-embed-kernel-code
# For each example, build both the original program and the generated
# OpenCL version and check that they produce the same output.
for i in $srcdir/examples/*.c; do
echo $i
name=`basename $i`
name="${name%.c}"
exe_ref="${OUTDIR}/$name.ref$EXEEXT"
gen_ocl="${OUTDIR}/$name.ppcg.c"
exe_ocl="${OUTDIR}/$name.ppcg$EXEEXT"
output_ref="${OUTDIR}/$name.ref.out"
output_ocl="${OUTDIR}/$name.ppcg.out"
$CC $CFLAGS $i -o $exe_ref || exit
./ppcg$EXEEXT --target=opencl --opencl-no-use-gpu $i -o "$gen_ocl" || \
exit
$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
"$gen_ocl" -o "$exe_ocl" || exit
$exe_ref > $output_ref || exit
$exe_ocl > $output_ocl || exit
cmp $output_ref $output_ocl || exit
done
# The output directory is only removed if all tests passed
# (a failure exits earlier) and --keep was not specified.
if [ $keep = "no" ]; then
rm -r "${OUTDIR}"
fi

View File

@ -1,109 +0,0 @@
#!/bin/sh
# Run ppcg on the benchmarks listed in $DIR/utilities/benchmark_list and
# check that the generated code produces the same (stderr) output
# as the original code.
#
# Usage: <script> [--keep] [--verbose]
#   --keep     store the results in out.$VERSION and do not remove them
#   --verbose  print the ppcg and compiler command lines
#
# The @...@ placeholders are presumably filled in when this template
# is instantiated by the build system.
keep=no
verbose=no
for option; do
case "$option" in
--keep)
keep=yes
;;
--verbose)
verbose=yes
;;
esac
done
EXEEXT=@EXEEXT@
DIR=@POLYBENCH_DIR@
VERSION=@GIT_HEAD_VERSION@
SIZE=-DMINI_DATASET
CC="@CC@"
HAVE_OPENCL=@HAVE_OPENCL@
HAVE_OPENMP=@HAVE_OPENMP@
srcdir="@srcdir@"
# Pick the output directory: a fixed name with --keep,
# a fresh temporary directory (below $TMPDIR or /tmp) otherwise.
if [ $keep = "yes" ]; then
OUTDIR="out.$VERSION"
mkdir "$OUTDIR" || exit 1
else
if test "x$TMPDIR" = "x"; then
TMPDIR=/tmp
fi
OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1
fi
CPPFLAGS="-DPOLYBENCH_USE_C99_PROTO -DPOLYBENCH_DUMP_ARRAYS"
CPPFLAGS="$CPPFLAGS $SIZE -I $DIR/utilities"
CFLAGS="-lm --std=gnu99"
echo "Running tests in folder ${OUTDIR}"
# run_tests <ext> [<ppcg options> [<compiler options>]]
# For each benchmark, generate code with ppcg using the given options,
# compile both the original and the generated code and compare what
# the two programs write to stderr.  <ext> is used in the names of
# the generated files.
run_tests () {
ext=$1
ppcg_options=$2
cc_options=$3
# The *_option_str variables are only used for the message below.
if [ "x$ppcg_options" = "x" ]; then
ppcg_option_str="none"
else
ppcg_option_str=$ppcg_options
fi
if [ "x$cc_options" = "x" ]; then
cc_option_str="none"
else
cc_option_str=$cc_options
fi
echo Test: $ext, ppcg options: $ppcg_option_str, CC options: $cc_option_str
for i in `cat $DIR/utilities/benchmark_list`; do
echo $i
name=`basename $i`
name=${name%.c}
source_opt="${OUTDIR}/$name.$ext.c"
prog_orig=${OUTDIR}/$name.orig${EXEEXT}
prog_opt=${OUTDIR}/$name.$ext${EXEEXT}
output_orig=${OUTDIR}/$name.orig.out
output_opt=${OUTDIR}/$name.$ext.out
dir=`dirname $i`
if [ $verbose = "yes" ]; then
echo ./ppcg$EXEEXT -I $DIR/$dir $DIR/$i \
$CPPFLAGS -o $source_opt $ppcg_options
fi
./ppcg$EXEEXT -I $DIR/$dir $DIR/$i $CPPFLAGS \
-o $source_opt $ppcg_options || exit
# Note that the exit status of building and running the original
# program is not checked.
$CC -I $DIR/$dir $CPPFLAGS $DIR/$i -o $prog_orig \
$DIR/utilities/polybench.c $CFLAGS
$prog_orig 2> $output_orig
if [ $verbose = "yes" ]; then
echo $CC -I $DIR/$dir $CPPFLAGS $source_opt \
-o $prog_opt $DIR/utilities/polybench.c \
$CFLAGS $cc_options
fi
$CC -I $DIR/$dir $CPPFLAGS $source_opt -o $prog_opt \
$DIR/utilities/polybench.c $CFLAGS $cc_options || exit
$prog_opt 2> $output_opt
cmp $output_orig $output_opt || exit
done
}
run_tests ppcg "--target=c --tile"
run_tests ppcg_live "--target=c --no-live-range-reordering --tile"
# Test OpenMP code, if compiler supports openmp
if [ $HAVE_OPENMP = "yes" ]; then
run_tests ppcg_omp "--target=c --openmp" -fopenmp
echo Introduced `grep -R 'omp parallel' "${OUTDIR}" | wc -l` '"pragma omp parallel for"'
else
echo Compiler does not support OpenMP. Skipping OpenMP tests.
fi
# Test OpenCL code, if OpenCL is available
if [ $HAVE_OPENCL = "yes" ]; then
run_tests ppcg_opencl "--target=opencl --opencl-no-use-gpu" \
"-I $srcdir $srcdir/ocl_utilities.c -lOpenCL"
fi
# The output directory is only removed if all tests passed
# (a failure exits earlier) and --keep was not specified.
if [ $keep = "no" ]; then
rm -r "${OUTDIR}"
fi

File diff suppressed because it is too large Load Diff

View File

@ -1,128 +0,0 @@
#ifndef PPCG_H
#define PPCG_H
#include <isl/schedule.h>
#include <isl/set.h>
#include <isl/union_set.h>
#include <isl/union_map.h>
#include <isl/id_to_ast_expr.h>
#include <pet.h>
#include "ppcg_options.h"
/* Helpers for deriving a base name from a file name.
 * NOTE(review): the exact contract (in particular the required size of
 * "name") is not visible in this header -- check the implementation.
 */
const char *ppcg_base_name(const char *filename);
int ppcg_extract_base_name(char *name, const char *input);
/* Representation of the scop for use inside PPCG.
 *
 * "options" are the options specified by the user.
 * Some fields in this structure may depend on some of the options.
 *
 * "start" and "end" are file offsets of the corresponding program text.
 * "context" represents constraints on the parameters.
 * "domain" is the union of all iteration domains.
 * "call" contains the iteration domains of statements with a call expression.
 * "reads" contains all potential read accesses.
 * "tagged_reads" is the same as "reads", except that the domain is a wrapped
 *	relation mapping an iteration domain to a reference identifier.
 * "live_in" contains the potential read accesses that potentially
 *	have no corresponding writes in the scop.
 * "may_writes" contains all potential write accesses.
 * "tagged_may_writes" is the same as "may_writes", except that the domain
 *	is a wrapped relation mapping an iteration domain
 *	to a reference identifier.
 * "must_writes" contains all definite write accesses.
 * "tagged_must_writes" is the same as "must_writes", except that the domain
 *	is a wrapped relation mapping an iteration domain
 *	to a reference identifier.
 * "live_out" contains the potential write accesses that are potentially
 *	not killed by any kills or any other writes.
 * "must_kills" contains all definite kill accesses.
 * "tagged_must_kills" is the same as "must_kills", except that the domain
 *	is a wrapped relation mapping an iteration domain
 *	to a reference identifier.
 *
 * "tagger" maps tagged iteration domains to the corresponding untagged
 *	iteration domain.
 *
 * "independence" is the union of all independence filters.
 *
 * "dep_flow" represents the potential flow dependences.
 * "tagged_dep_flow" is the same as "dep_flow", except that both domain and
 *	range are wrapped relations mapping an iteration domain to
 *	a reference identifier.  May be NULL if not computed.
 * "dep_false" represents the potential false (anti and output) dependences.
 * "dep_forced" represents the validity constraints that should be enforced
 *	even when live-range reordering is used.
 *	In particular, these constraints ensure that all live-in
 *	accesses remain live-in and that all live-out accesses remain live-out
 *	and that multiple potential sources for the same read are
 *	executed in the original order.
 * "dep_order"/"tagged_dep_order" represents the order dependences between
 *	the live range intervals in "dep_flow"/"tagged_dep_flow".
 *	It is only used if the live_range_reordering
 *	option is set.  Otherwise it is NULL.
 *	If "dep_order" is used, then "dep_false" only contains a limited
 *	set of anti and output dependences.
 * "schedule" represents the (original) schedule.
 *
 * "names" contains all variable names that are in use by the scop.
 * The names are mapped to a dummy value.
 *
 * "pet" is the original pet_scop.
 */
struct ppcg_scop {
struct ppcg_options *options;
unsigned start;
unsigned end;
isl_set *context;
isl_union_set *domain;
isl_union_set *call;
isl_union_map *tagged_reads;
isl_union_map *reads;
isl_union_map *live_in;
isl_union_map *tagged_may_writes;
isl_union_map *may_writes;
isl_union_map *tagged_must_writes;
isl_union_map *must_writes;
isl_union_map *live_out;
isl_union_map *tagged_must_kills;
isl_union_map *must_kills;
isl_union_pw_multi_aff *tagger;
isl_union_map *independence;
isl_union_map *dep_flow;
isl_union_map *tagged_dep_flow;
isl_union_map *dep_false;
isl_union_map *dep_forced;
isl_union_map *dep_order;
isl_union_map *tagged_dep_order;
isl_schedule *schedule;
isl_id_to_ast_expr *names;
struct pet_scop *pet;
};
int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop);
/* Returns a list of identifiers owned by the caller (__isl_give);
 * presumably "n" names derived from "prefix" that do not clash
 * with the names in scop->names -- TODO confirm.
 */
__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop,
int n, const char *prefix);
/* Takes a callback "fn" that receives a printer, a ppcg_scop and "user"
 * and returns a printer.  Presumably "fn" is called for each scop found
 * in "input", with the result written to "out" -- TODO confirm.
 */
int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out,
struct ppcg_options *options,
__isl_give isl_printer *(*fn)(__isl_take isl_printer *p,
struct ppcg_scop *scop, void *user), void *user);
/* Compute a schedule on the domain of "sc" that respects the schedule
 * constraints in "sc".  "sc" is consumed (__isl_take), "schedule" is not
 * (__isl_keep) and the result is owned by the caller (__isl_give).
 */
__isl_give isl_schedule *ppcg_compute_schedule(
__isl_take isl_schedule_constraints *sc,
__isl_keep isl_schedule *schedule, struct ppcg_options *options);
/* Functions operating on a ppcg_scop.
 * NOTE(review): judging by the names, these fill in the "tagger" field,
 * fill in the dep_* fields and remove dead code, respectively;
 * ppcg_scop_free presumably returns NULL -- confirm against
 * the implementation.
 */
void compute_tagger(struct ppcg_scop *ps);
void compute_dependences(struct ppcg_scop *scop);
void eliminate_dead_code(struct ppcg_scop *ps);
void *ppcg_scop_free(struct ppcg_scop *ps);
#endif

View File

@ -1,136 +0,0 @@
/*
* Copyright 2010-2011 INRIA Saclay
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
* Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
* 91893 Orsay, France
*/
#include "ppcg_options.h"
/* The choices accepted by the "target" option (see ppcg_options_args),
 * each paired with the corresponding PPCG_TARGET_* value.
 * The list is terminated by a zero entry.
 */
static struct isl_arg_choice target[] = {
{"c", PPCG_TARGET_C},
{"cuda", PPCG_TARGET_CUDA},
{"opencl", PPCG_TARGET_OPENCL},
{0}
};
/* Set the defaults that depend on the target.
 *
 * In particular, turn off the isl option --schedule-outer-coincidence
 * for the C target and turn it on for any other target.
 * The option is set by handing a two-element argument vector
 * (a program name followed by the option) to isl_options_parse.
 */
void ppcg_options_set_target_defaults(struct ppcg_options *options)
{
	char *argv[2];

	argv[0] = "ppcg_options_set_target_defaults";
	argv[1] = options->target == PPCG_TARGET_C ?
			"--no-schedule-outer-coincidence" :
			"--schedule-outer-coincidence";

	isl_options_parse(options->isl, 2, argv, ISL_ARG_ALL);
}
/* Callback that is called whenever the "target" option is set (to "val").
 * "opt" points to the ppcg_options structure, in which the target
 * has already been updated by the time this callback is called.
 *
 * Reset the target-dependent options through
 * ppcg_options_set_target_defaults.  Always return 0.
 */
static int set_target(void *opt, unsigned val)
{
	ppcg_options_set_target_defaults((struct ppcg_options *) opt);

	return 0;
}
/* Description of the command line options that are stored
 * in a struct ppcg_debug_options.
 * All of these are boolean options that default to 0;
 * "verbose" can also be specified as -v.
 */
ISL_ARGS_START(struct ppcg_debug_options, ppcg_debug_options_args)
ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule_constraints, 0,
"dump-schedule-constraints", 0, "dump schedule constraints")
ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule, 0,
"dump-schedule", 0, "dump isl computed schedule")
ISL_ARG_BOOL(struct ppcg_debug_options, dump_final_schedule, 0,
"dump-final-schedule", 0, "dump PPCG computed schedule")
ISL_ARG_BOOL(struct ppcg_debug_options, dump_sizes, 0,
"dump-sizes", 0,
"dump effectively used per kernel tile, grid and block sizes")
ISL_ARG_BOOL(struct ppcg_debug_options, verbose, 'v', "verbose", 0, NULL)
ISL_ARGS_END
/* Description of the OpenCL specific command line options.
 * They are stored in the opencl_* fields of struct ppcg_options and
 * are included in ppcg_options_args as the "opencl" group.
 */
ISL_ARGS_START(struct ppcg_options, ppcg_opencl_options_args)
ISL_ARG_STR(struct ppcg_options, opencl_compiler_options, 0, "compiler-options",
"options", NULL, "options to pass to the OpenCL compiler")
ISL_ARG_BOOL(struct ppcg_options, opencl_use_gpu, 0, "use-gpu", 1,
"use GPU device (if available)")
ISL_ARG_STR_LIST(struct ppcg_options, opencl_n_include_file,
opencl_include_files, 0, "include-file", "filename",
"file to #include in generated OpenCL code")
ISL_ARG_BOOL(struct ppcg_options, opencl_print_kernel_types, 0,
"print-kernel-types", 1,
"print definitions of types in the kernel file")
ISL_ARG_BOOL(struct ppcg_options, opencl_embed_kernel_code, 0,
"embed-kernel-code", 0, "embed kernel code into host code")
ISL_ARGS_END
/* Description of the command line options of PPCG.
 * The values are stored in a struct ppcg_options, with the isl options
 * and the debugging options stored in the "isl" and "debug" children.
 *
 * Setting the "target" option triggers a call to set_target, which
 * resets the options whose defaults depend on the target.
 * The default target is PPCG_TARGET_CUDA.
 */
ISL_ARGS_START(struct ppcg_options, ppcg_options_args)
ISL_ARG_CHILD(struct ppcg_options, isl, "isl", &isl_options_args, "isl options")
ISL_ARG_CHILD(struct ppcg_options, debug, NULL, &ppcg_debug_options_args,
	"debugging options")
ISL_ARG_BOOL(struct ppcg_options, group_chains, 0, "group-chains", 1,
	"group chains of interdependent statements that are executed "
	"consecutively in the original schedule before scheduling")
ISL_ARG_BOOL(struct ppcg_options, reschedule, 0, "reschedule", 1,
	"replace original schedule by isl computed schedule")
ISL_ARG_BOOL(struct ppcg_options, scale_tile_loops, 0,
	"scale-tile-loops", 1, NULL)
ISL_ARG_BOOL(struct ppcg_options, wrap, 0, "wrap", 1, NULL)
ISL_ARG_BOOL(struct ppcg_options, use_shared_memory, 0, "shared-memory", 1,
	"use shared memory in kernel code")
ISL_ARG_BOOL(struct ppcg_options, use_private_memory, 0, "private-memory", 1,
	"use private memory in kernel code")
ISL_ARG_STR(struct ppcg_options, ctx, 0, "ctx", "context", NULL,
	"Constraints on parameters")
ISL_ARG_BOOL(struct ppcg_options, non_negative_parameters, 0,
	"assume-non-negative-parameters", 0,
	"assume all parameters are non-negative")
ISL_ARG_BOOL(struct ppcg_options, tile, 0, "tile", 0,
	"perform tiling (C target)")
ISL_ARG_INT(struct ppcg_options, tile_size, 'S', "tile-size", "size", 32, NULL)
ISL_ARG_BOOL(struct ppcg_options, isolate_full_tiles, 0, "isolate-full-tiles",
	0, "isolate full tiles from partial tiles (hybrid tiling)")
ISL_ARG_STR(struct ppcg_options, sizes, 0, "sizes", "sizes", NULL,
	"Per kernel tile, grid and block sizes")
ISL_ARG_INT(struct ppcg_options, max_shared_memory, 0,
	"max-shared-memory", "size", 8192, "maximal amount of shared memory")
ISL_ARG_BOOL(struct ppcg_options, openmp, 0, "openmp", 0,
	"Generate OpenMP macros (only for C target)")
ISL_ARG_USER_OPT_CHOICE(struct ppcg_options, target, 0, "target", target,
	&set_target, PPCG_TARGET_CUDA, PPCG_TARGET_CUDA,
	"the target to generate code for")
ISL_ARG_BOOL(struct ppcg_options, linearize_device_arrays, 0,
	"linearize-device-arrays", 1,
	"linearize all device arrays, even those of fixed size")
ISL_ARG_BOOL(struct ppcg_options, allow_gnu_extensions, 0,
	"allow-gnu-extensions", 1,
	"allow the use of GNU extensions in generated code")
ISL_ARG_BOOL(struct ppcg_options, live_range_reordering, 0,
	"live-range-reordering", 1,
	"allow successive live ranges on the same memory element "
	"to be reordered")
ISL_ARG_BOOL(struct ppcg_options, hybrid, 0, "hybrid", 0,
	"apply hybrid tiling whenever a suitable input pattern is found "
	"(GPU targets)")
ISL_ARG_BOOL(struct ppcg_options, unroll_copy_shared, 0, "unroll-copy-shared",
	0, "unroll code for copying to/from shared memory")
ISL_ARG_BOOL(struct ppcg_options, unroll_gpu_tile, 0, "unroll-gpu-tile", 0,
	"unroll code inside tile on GPU targets")
ISL_ARG_GROUP("opencl", &ppcg_opencl_options_args, "OpenCL options")
ISL_ARG_STR(struct ppcg_options, save_schedule_file, 0, "save-schedule",
	"file", NULL, "save isl computed schedule to <file>")
ISL_ARG_STR(struct ppcg_options, load_schedule_file, 0, "load-schedule",
	"file", NULL, "load schedule from <file>, "
	"using it instead of an isl computed schedule")
ISL_ARGS_END

View File

@ -1,100 +0,0 @@
#ifndef PPCG_OPTIONS_H
#define PPCG_OPTIONS_H
#include <isl/arg.h>
#include <isl/options.h>
/* Debugging options.  Each field is a boolean flag. */
struct ppcg_debug_options {
int dump_schedule_constraints;
int dump_schedule;
int dump_final_schedule;
int dump_sizes;
int verbose;
};
/* The options of PPCG, including the isl options and
 * the debugging options.
 */
struct ppcg_options {
struct isl_options *isl;
struct ppcg_debug_options *debug;
/* Group chains of consecutive statements before scheduling. */
int group_chains;
/* Use isl to compute a schedule replacing the original schedule. */
int reschedule;
/* NOTE(review): the next two options have no description
 * in this header -- see their uses for the exact meaning.
 */
int scale_tile_loops;
int wrap;
/* Assume all parameters are non-negative. */
int non_negative_parameters;
/* User specified string options; may be NULL
 * (NULL is the default in the option table -- TODO confirm).
 */
char *ctx;
char *sizes;
/* Perform tiling (C target). */
int tile;
int tile_size;
/* Isolate full tiles from partial tiles. */
int isolate_full_tiles;
/* Take advantage of private memory. */
int use_private_memory;
/* Take advantage of shared memory. */
int use_shared_memory;
/* Maximal amount of shared memory. */
int max_shared_memory;
/* The target we generate code for (one of the PPCG_TARGET_* values). */
int target;
/* Generate OpenMP macros (C target only). */
int openmp;
/* Linearize all device arrays. */
int linearize_device_arrays;
/* Allow the use of GNU extensions in generated code. */
int allow_gnu_extensions;
/* Allow live range to be reordered. */
int live_range_reordering;
/* Allow hybrid tiling whenever a suitable input pattern is found. */
int hybrid;
/* Unroll the code for copying to/from shared memory. */
int unroll_copy_shared;
/* Unroll code inside tile on GPU targets. */
int unroll_gpu_tile;
/* Options to pass to the OpenCL compiler. */
char *opencl_compiler_options;
/* Prefer GPU device over CPU. */
int opencl_use_gpu;
/* Number of files to include. */
int opencl_n_include_file;
/* Files to include. */
const char **opencl_include_files;
/* Print definitions of types in kernels. */
int opencl_print_kernel_types;
/* Embed OpenCL kernel code in host code. */
int opencl_embed_kernel_code;
/* Name of file for saving isl computed schedule or NULL. */
char *save_schedule_file;
/* Name of file for loading schedule or NULL. */
char *load_schedule_file;
};
ISL_ARG_DECL(ppcg_debug_options, struct ppcg_debug_options,
ppcg_debug_options_args)
ISL_ARG_DECL(ppcg_options, struct ppcg_options, ppcg_options_args)
/* Possible values of the "target" field of struct ppcg_options. */
#define PPCG_TARGET_C 0
#define PPCG_TARGET_CUDA 1
#define PPCG_TARGET_OPENCL 2
/* Set the defaults of the options that depend on options->target. */
void ppcg_options_set_target_defaults(struct ppcg_options *options);
#endif

View File

@ -1,461 +0,0 @@
/*
* Copyright 2012-2013 Ecole Normale Superieure
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege,
* Ecole Normale Superieure, 45 rue dUlm, 75230 Paris, France
*/
#include <isl/aff.h>
#include <isl/ast_build.h>
#include <isl/id.h>
#include "print.h"
#include "util.h"
/* Start a block in the output of "p": print "{" on a line of its own
 * and adjust the indentation of "p" by 2 for the lines that follow.
 */
__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p)
{
	p = isl_printer_print_str(isl_printer_start_line(p), "{");

	return isl_printer_indent(isl_printer_end_line(p), 2);
}
/* End a block in the output of "p": adjust the indentation of "p"
 * by -2 and print "}" on a line of its own.
 */
__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p)
{
	p = isl_printer_start_line(isl_printer_indent(p, -2));

	return isl_printer_end_line(isl_printer_print_str(p, "}"));
}
/* Names of the notes that record whether the min/max
 * macro definitions have already been printed.
 */
static const char *ppcg_max_printed = "ppcg_max_printed";
static const char *ppcg_min_printed = "ppcg_min_printed";

/* Does "p" have a note called "note_name", i.e., has the macro
 * definition corresponding to "note_name" been printed to "p" before?
 *
 * Return isl_bool_error if "p" is NULL.
 */
static isl_bool printed_before(__isl_keep isl_printer *p, const char *note_name)
{
	isl_id *note_id;
	isl_bool found;

	if (!p)
		return isl_bool_error;

	/* The identifier is only needed for the lookup. */
	note_id = isl_id_alloc(isl_printer_get_ctx(p), note_name, NULL);
	found = isl_printer_has_note(p, note_id);
	isl_id_free(note_id);

	return found;
}
/* Record that the macro definition corresponding to "note_name"
 * has been printed to "p" by attaching a note with that name to "p".
 *
 * Only the presence of the note matters, but its value has to be
 * a valid isl_id, so a copy of the note identifier is used as value.
 *
 * Return NULL if "p" is NULL.
 */
static __isl_give isl_printer *mark_printed(__isl_take isl_printer *p,
	const char *note_name)
{
	isl_id *note_id, *value;

	if (!p)
		return NULL;

	note_id = isl_id_alloc(isl_printer_get_ctx(p), note_name, NULL);
	value = isl_id_copy(note_id);

	return isl_printer_set_note(p, note_id, value);
}
/* Print "#define " followed by "name" and "def" on a line of "p",
 * unless this has been done on "p" before.
 * "note_name" is the name of the note on "p" that records
 * whether the definition has been printed.
 *
 * Free "p" and return the result of isl_printer_free
 * if the note cannot be checked.
 */
static __isl_give isl_printer *print_ppcg_macro(__isl_take isl_printer *p,
	const char *name, const char *def, const char *note_name)
{
	isl_bool done = printed_before(p, note_name);

	if (done < 0)
		return isl_printer_free(p);

	if (!done) {
		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p, "#define ");
		p = isl_printer_print_str(p, name);
		p = isl_printer_print_str(p, def);
		p = isl_printer_end_line(p);
		p = mark_printed(p, note_name);
	}

	return p;
}
/* Structure for keeping track of definitions of some macros.
 *
 * "min" and "max" are the texts printed after the macro name
 * in the "#define" lines for the min and max macros.
 * The strings are referenced, not copied (see ppcg_set_macros).
 */
struct ppcg_macros {
const char *min;
const char *max;
};
/* Free the memory allocated by a struct ppcg_macros.
 * Only the structure itself is freed, not the strings it points to.
 * The void pointer argument allows this function to be used
 * as the free_user callback of an isl_id.
 */
static void ppcg_macros_free(void *user)
{
free(user);
}
/* Default macro definitions (when GNU extensions are allowed).
 *
 * Both definitions use the GNU statement expression and __typeof__
 * extensions to evaluate each argument only once.
 * They are used by get_macros when no macro definitions
 * have been attached to the printer.
 */
struct ppcg_macros ppcg_macros_default = {
.min = "(x,y) "
"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
"_x < _y ? _x : _y; })",
.max = "(x,y) "
"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
"_x > _y ? _x : _y; })",
};
/* Name used for the note that keeps track of macro definitions.
 */
static const char *ppcg_macros = "ppcg_macros";
/* Use "min" and "max" as the macro definitions for isl_ast_op_min and
 * isl_ast_op_max when printing to "p".
 *
 * In particular, store the two definitions in a freshly allocated
 * ppcg_macros object and attach that object to "p" as the user pointer
 * of the identifier stored in the "ppcg_macros" note.
 * The object is freed along with that identifier.
 * Only the pointers "min" and "max" are stored, not copies
 * of the strings.
 *
 * Return NULL if "p" is NULL.
 */
__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
	const char *min, const char *max)
{
	isl_ctx *ctx;
	isl_id *name_id, *holder;
	struct ppcg_macros *defs;

	if (!p)
		return NULL;

	ctx = isl_printer_get_ctx(p);
	defs = isl_alloc_type(ctx, struct ppcg_macros);
	if (!defs)
		return isl_printer_free(p);
	defs->min = min;
	defs->max = max;

	name_id = isl_id_alloc(ctx, ppcg_macros, NULL);
	holder = isl_id_alloc(ctx, NULL, defs);
	/* Hand "defs" over to "holder", or release it right away
	 * if "holder" could not be created.
	 */
	if (holder)
		holder = isl_id_set_free_user(holder, &ppcg_macros_free);
	else
		ppcg_macros_free(defs);

	return isl_printer_set_note(p, name_id, holder);
}
/* Look up the macro definitions that are currently active for "p".
 *
 * If "p" carries a note keyed by "ppcg_macros", then the user pointer
 * of that note is returned.  If it carries no such note, then a pointer
 * to ppcg_macros_default is returned.
 * NULL is returned if the presence of the note cannot be determined.
 */
static struct ppcg_macros *get_macros(__isl_keep isl_printer *p)
{
	isl_id *key, *note;
	isl_bool has_note;
	struct ppcg_macros *defs;

	key = isl_id_alloc(isl_printer_get_ctx(p), ppcg_macros, NULL);
	has_note = isl_printer_has_note(p, key);
	if (has_note < 0) {
		isl_id_free(key);
		return NULL;
	}
	if (!has_note) {
		isl_id_free(key);
		return &ppcg_macros_default;
	}

	/* "key" is handed over to isl_printer_get_note; only the returned
	 * note needs to be freed here.
	 */
	note = isl_printer_get_note(p, key);
	defs = isl_id_get_user(note);
	isl_id_free(note);

	return defs;
}
/* Print the definition of the ppcg_max macro that is currently active
 * for "p", unless it has been printed to "p" before.
 * "p" is freed and NULL is returned if the active definitions
 * cannot be obtained.
 */
static __isl_give isl_printer *print_max(__isl_take isl_printer *p)
{
	struct ppcg_macros *defs = get_macros(p);

	if (defs)
		return print_ppcg_macro(p, ppcg_max, defs->max,
					ppcg_max_printed);

	return isl_printer_free(p);
}
/* Print the definition of the ppcg_min macro that is currently active
 * for "p", unless it has been printed to "p" before.
 * "p" is freed and NULL is returned if the active definitions
 * cannot be obtained.
 */
static __isl_give isl_printer *print_min(__isl_take isl_printer *p)
{
	struct ppcg_macros *defs = get_macros(p);

	if (defs)
		return print_ppcg_macro(p, ppcg_min, defs->min,
					ppcg_min_printed);

	return isl_printer_free(p);
}
/* Print a macro definition for the operation "type" to "p".
 *
 * If the ppcg options of the context allow GNU extensions, then
 * isl_ast_op_min and isl_ast_op_max get the ppcg specific definitions.
 * In all other cases (no ppcg options, GNU extensions not allowed,
 * or any other operation type), the isl definition is printed.
 */
__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
	__isl_take isl_printer *p)
{
	struct ppcg_options *options;

	if (!p)
		return NULL;

	options = isl_ctx_peek_options(isl_printer_get_ctx(p),
					&ppcg_options_args);
	if (options && options->allow_gnu_extensions) {
		if (type == isl_ast_op_max)
			return print_max(p);
		if (type == isl_ast_op_min)
			return print_min(p);
	}

	return isl_ast_op_type_print_macro(type, p);
}
/* Callback for isl_ast_expr_foreach_ast_op_type and
 * isl_ast_node_foreach_ast_op_type that prints a macro definition
 * for "type".
 * "user" points to the isl_printer pointer to print to; it is updated
 * in place.  An error is reported if printing leaves it NULL.
 */
static isl_stat print_macro(enum isl_ast_op_type type, void *user)
{
	isl_printer **printer = user;

	*printer = ppcg_print_macro(type, *printer);

	return *printer ? isl_stat_ok : isl_stat_error;
}
/* Print to "p" the macro definitions required by the operations
 * appearing in "expr".
 * On failure, "p" is freed and NULL is returned.
 */
__isl_give isl_printer *ppcg_ast_expr_print_macros(
	__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p)
{
	isl_stat r;

	r = isl_ast_expr_foreach_ast_op_type(expr, &print_macro, &p);
	if (r >= 0)
		return p;

	return isl_printer_free(p);
}
/* Callback for isl_id_to_ast_expr_foreach that prints the macro
 * definitions required by "val".
 * "user" points to the isl_printer pointer to print to; it is updated
 * in place.  Both "key" and "val" are released here.
 */
static isl_stat print_expr_macros(__isl_take isl_id *key,
	__isl_take isl_ast_expr *val, void *user)
{
	isl_printer **printer = user;

	/* The key plays no role in the printing. */
	isl_id_free(key);
	*printer = ppcg_ast_expr_print_macros(val, *printer);
	isl_ast_expr_free(val);

	return *printer ? isl_stat_ok : isl_stat_error;
}
/* Print to "p" the macro definitions needed for the body of a statement
 * whose access expressions are replaced by the isl_ast_expr objects
 * stored in "ref2expr".
 * On failure, "p" is freed and NULL is returned.
 */
__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
	__isl_keep isl_id_to_ast_expr *ref2expr)
{
	isl_stat r;

	r = isl_id_to_ast_expr_foreach(ref2expr, &print_expr_macros, &p);
	if (r >= 0)
		return p;

	return isl_printer_free(p);
}
/* Print to "p" the macro definitions required by the operations
 * appearing in "node".
 * On failure, "p" is freed and NULL is returned.
 */
__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
	__isl_keep isl_ast_node *node)
{
	isl_stat r;

	r = isl_ast_node_foreach_ast_op_type(node, &print_macro, &p);
	if (r >= 0)
		return p;

	return isl_printer_free(p);
}
/* Names used for the macros that may appear in a printed isl AST.
 *
 * ppcg_set_macro_names installs them as the print names of
 * isl_ast_op_min, isl_ast_op_max and isl_ast_op_fdiv_q respectively.
 */
const char *ppcg_min = "ppcg_min";
const char *ppcg_max = "ppcg_max";
const char *ppcg_fdiv_q = "ppcg_fdiv_q";
/* Make "p" print isl_ast_op_min, isl_ast_op_max and isl_ast_op_fdiv_q
 * using the names stored in ppcg_min, ppcg_max and ppcg_fdiv_q.
 */
__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p)
{
	const struct {
		enum isl_ast_op_type type;
		const char *name;
	} names[] = {
		{ isl_ast_op_min, ppcg_min },
		{ isl_ast_op_max, ppcg_max },
		{ isl_ast_op_fdiv_q, ppcg_fdiv_q },
	};
	int i;
	int n = sizeof(names) / sizeof(names[0]);

	for (i = 0; i < n; ++i)
		p = isl_ast_op_type_set_print_name(p, names[i].type,
						names[i].name);

	return p;
}
/* Turn "mpa", which has no domain, into an expression that has
 * the schedule space of "build" as its domain.
 *
 * Nothing needs to change if that schedule space is a parameter space.
 * Otherwise, "mpa" is given a 0D domain and is then pulled back over
 * a zero-dimensional mapping from the schedule space of "build"
 * to that 0D domain.
 * NULL is returned (and "mpa" is freed) if the schedule space
 * cannot be classified.
 */
__isl_give isl_multi_pw_aff *ppcg_attach_multi_pw_aff(
	__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_ast_build *build)
{
	isl_bool is_params;
	isl_space *space;
	isl_multi_aff *to_0d;

	space = isl_ast_build_get_schedule_space(build);
	is_params = isl_space_is_params(space);
	if (is_params < 0) {
		isl_space_free(space);
		return isl_multi_pw_aff_free(mpa);
	}
	if (is_params) {
		isl_space_free(space);
		return mpa;
	}

	to_0d = isl_multi_aff_zero(isl_space_from_domain(space));
	mpa = isl_multi_pw_aff_from_range(mpa);

	return isl_multi_pw_aff_pullback_multi_aff(mpa, to_0d);
}
/* Construct an access AST expression for "size" with the help of "build".
 * Since "size" has no domain while "build" may have a proper schedule
 * space, "size" is first attached to that schedule space.
 */
__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
	__isl_keep isl_ast_build *build)
{
	isl_multi_pw_aff *attached;

	attached = ppcg_attach_multi_pw_aff(size, build);

	return isl_ast_build_access_from_multi_pw_aff(build, attached);
}
/* Print the declaration "<base_type> <size>;" on a line of its own to "p",
 * where "size" is printed as an AST expression.
 * Any macro definitions required by "size" are printed first.
 * If "base_type" or "size" is NULL, then "p" is freed and NULL is returned.
 */
__isl_give isl_printer *ppcg_print_declaration_with_size(
	__isl_take isl_printer *p, const char *base_type,
	__isl_keep isl_ast_expr *size)
{
	if (!(base_type && size))
		return isl_printer_free(p);

	/* Macros used inside "size" must precede the declaration itself. */
	p = ppcg_ast_expr_print_macros(size, p);

	p = isl_printer_start_line(p);
	p = isl_printer_print_str(p, base_type);
	p = isl_printer_print_str(p, " ");
	p = isl_printer_print_ast_expr(p, size);
	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}
/* Print a declaration for "array" to "p".
 *
 * The size is derived from the extent of the array and converted
 * into an "access expression" by "build", which may simplify it.
 * If "array" is NULL, then "p" is freed and NULL is returned.
 */
__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
	struct pet_array *array, __isl_keep isl_ast_build *build)
{
	isl_ast_expr *size_expr;

	if (!array)
		return isl_printer_free(p);

	size_expr = isl_ast_build_access_from_multi_pw_aff(build,
			ppcg_size_from_extent(isl_set_copy(array->extent)));
	p = ppcg_print_declaration_with_size(p, array->element_type,
						size_expr);
	isl_ast_expr_free(size_expr);

	return p;
}
/* Print declarations for those arrays of "scop" that are marked
 * as declared and whose "exposed" field equals "exposed"
 * (1 for exposed arrays, 0 for arrays that are not exposed).
 * The AST build used for the sizes is created from the context of "scop".
 * If "scop" is NULL, then "p" is freed and NULL is returned.
 */
static __isl_give isl_printer *print_declarations(__isl_take isl_printer *p,
	struct ppcg_scop *scop, int exposed)
{
	int pos;
	isl_ast_build *build;

	if (!scop)
		return isl_printer_free(p);

	build = isl_ast_build_from_context(isl_set_copy(scop->context));
	for (pos = 0; pos < scop->pet->n_array; ++pos) {
		struct pet_array *array = scop->pet->arrays[pos];

		if (array->declared && array->exposed == exposed)
			p = ppcg_print_declaration(p, array, build);
	}
	isl_ast_build_free(build);

	return p;
}
/* Print declarations for the arrays in "scop" that are declared
 * and exposed to the code after the scop.
 *
 * This forwards to print_declarations with "exposed" set to 1.
 */
__isl_give isl_printer *ppcg_print_exposed_declarations(
__isl_take isl_printer *p, struct ppcg_scop *scop)
{
return print_declarations(p, scop, 1);
}
/* Print declarations for the arrays in "scop" that are declared,
 * but not exposed to the code after the scop.
 *
 * This forwards to print_declarations with "exposed" set to 0.
 */
__isl_give isl_printer *ppcg_print_hidden_declarations(
__isl_take isl_printer *p, struct ppcg_scop *scop)
{
return print_declarations(p, scop, 0);
}

View File

@ -1,40 +0,0 @@
#ifndef PRINT_H
#define PRINT_H
#include <isl/ast.h>
#include "ppcg.h"
extern const char *ppcg_min;
extern const char *ppcg_max;
extern const char *ppcg_fdiv_q;
__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p);
__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p);
__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p);
__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
const char *min, const char *max);
__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
__isl_take isl_printer *p);
__isl_give isl_printer *ppcg_ast_expr_print_macros(
__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p);
__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
__isl_keep isl_id_to_ast_expr *ref2expr);
__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
__isl_keep isl_ast_node *node);
__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
__isl_keep isl_ast_build *build);
__isl_give isl_printer *ppcg_print_declaration_with_size(
__isl_take isl_printer *p, const char *base_type,
__isl_keep isl_ast_expr *size);
__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
struct pet_array *array, __isl_keep isl_ast_build *build);
__isl_give isl_printer *ppcg_print_exposed_declarations(
__isl_take isl_printer *p, struct ppcg_scop *scop);
__isl_give isl_printer *ppcg_print_hidden_declarations(
__isl_take isl_printer *p, struct ppcg_scop *scop);
#endif

View File

@ -1,165 +0,0 @@
/*
* Copyright 2010-2011 INRIA Saclay
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
* Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
* 91893 Orsay, France
*/
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <isl/set.h>
#include <isl/map.h>
#include <isl/constraint.h>
#include "schedule.h"
/* Add parameters with identifiers "ids" to "set".
 *
 * One parameter is appended to "set" for each element of "ids",
 * in the order of the list, after the parameters already present.
 * If the length of "ids" or the number of parameters of "set"
 * cannot be determined (negative result), then "set" is freed and
 * NULL is returned instead of passing the error value on
 * as a dimension count or position.
 */
static __isl_give isl_set *add_params(__isl_take isl_set *set,
	__isl_keep isl_id_list *ids)
{
	int i, n, nparam;

	n = isl_id_list_n_id(ids);
	nparam = isl_set_dim(set, isl_dim_param);
	if (n < 0 || nparam < 0)
		return isl_set_free(set);

	set = isl_set_add_dims(set, isl_dim_param, n);
	for (i = 0; i < n; ++i) {
		isl_id *id;

		id = isl_id_list_get_id(ids, i);
		set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id);
	}

	return set;
}
/* Introduce fresh parameters with identifiers "ids" in "set" and
 * equate each of them to a set dimension of "set": the i-th new
 * parameter is equated to set dimension "first" + i.
 * As many dimensions are equated as there are elements in "ids".
 */
static __isl_give isl_set *parametrize(__isl_take isl_set *set,
	int first, __isl_keep isl_id_list *ids)
{
	int pos;
	int n_id = isl_id_list_n_id(ids);
	/* Position of the first new parameter, recorded before adding. */
	unsigned first_param = isl_set_dim(set, isl_dim_param);

	set = add_params(set, ids);
	for (pos = 0; pos < n_id; ++pos)
		set = isl_set_equate(set, isl_dim_param, first_param + pos,
					isl_dim_set, first + pos);

	return set;
}
/* Starting from the parameter space "space", construct a set with
 * "len" set dimensions in which the dimensions starting at position
 * "first" are equated to freshly created parameters
 * with identifiers "ids".
 */
__isl_give isl_set *parametrization(__isl_take isl_space *space,
	int len, int first, __isl_keep isl_id_list *ids)
{
	isl_set *universe;

	space = isl_space_add_dims(isl_space_set_from_params(space),
					isl_dim_set, len);
	universe = isl_set_universe(space);

	return parametrize(universe, first, ids);
}
/* Read a schedule from the file called "filename" and return it.
 * If the file cannot be opened for reading, a message is printed
 * to stderr and NULL is returned.
 */
static __isl_give isl_schedule *load_schedule(isl_ctx *ctx,
	const char *filename)
{
	isl_schedule *schedule;
	FILE *in = fopen(filename, "r");

	if (in == NULL) {
		fprintf(stderr, "Unable to open '%s' for reading\n", filename);
		return NULL;
	}

	schedule = isl_schedule_read_from_file(ctx, in);
	fclose(in);

	return schedule;
}
/* Write "schedule" to the file called "filename", using the block
 * YAML style.
 * Nothing is written if "schedule" is NULL.  If the file cannot be
 * opened for writing, a message is printed to stderr instead.
 */
static void save_schedule(__isl_keep isl_schedule *schedule,
	const char *filename)
{
	FILE *out;
	isl_printer *printer;

	if (!schedule)
		return;

	out = fopen(filename, "w");
	if (out == NULL) {
		fprintf(stderr, "Unable to open '%s' for writing\n", filename);
		return;
	}

	printer = isl_printer_to_file(isl_schedule_get_ctx(schedule), out);
	printer = isl_printer_set_yaml_style(printer, ISL_YAML_STYLE_BLOCK);
	printer = isl_printer_print_schedule(printer, schedule);
	isl_printer_free(printer);

	fclose(out);
}
/* Obtain a schedule, either by reading it from the file named by
 * the load_schedule_file option or, if that option is not set,
 * by calling "compute" on "user".
 * A computed schedule is saved to the file named by the
 * save_schedule_file option, if set.  The obtained schedule is dumped
 * if the dump_schedule debug option is set.
 */
__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
	struct ppcg_options *options,
	__isl_give isl_schedule *(*compute)(void *user), void *user)
{
	isl_schedule *schedule;

	if (!options->load_schedule_file) {
		schedule = compute(user);
		if (options->save_schedule_file)
			save_schedule(schedule, options->save_schedule_file);
	} else {
		schedule = load_schedule(ctx, options->load_schedule_file);
	}

	if (options->debug->dump_schedule)
		isl_schedule_dump(schedule);

	return schedule;
}
/* Set the AST loop type of every member of the band node "node"
 * to "type".
 */
__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
	__isl_take isl_schedule_node *node, enum isl_ast_loop_type type)
{
	int pos;
	int n_member = isl_schedule_node_band_n_member(node);

	for (pos = 0; pos < n_member; ++pos)
		node = isl_schedule_node_band_member_set_ast_loop_type(node,
								pos, type);

	return node;
}

View File

@ -1,21 +0,0 @@
#ifndef _SCHEDULE_H
#define _SCHEDULE_H
#include <isl/id.h>
#include <isl/space.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>
#include "ppcg_options.h"
__isl_give isl_set *parametrization(__isl_take isl_space *space,
int len, int first, __isl_keep isl_id_list *names);
__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
struct ppcg_options *options,
__isl_give isl_schedule *(*compute)(void *user), void *user);
__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
__isl_take isl_schedule_node *node, enum isl_ast_loop_type type);
#endif

View File

@ -1,49 +0,0 @@
#include <stdlib.h>
/* Test program: inside the scop, copy B[1][j][i] to A[1][i][j]
 * for 0 <= i, j < 256, but only for those j with
 * j % 8 <= 2 or j % 8 >= 6.
 * NOTE(review): B is not written anywhere in this file before it is read,
 * while the final loop expects A[1][0][i] == i; confirm how this test
 * is meant to be used.
 */
int main()
{
int A[2][1000][1000];
int B[2][1000][1000];
#pragma scop
{
for (int i = 0; i < 256; ++i)
for (int j = 0; j < 256; ++j)
if (j % 8 <= 2 || j % 8 >= 6)
A[1][i][j] = B[1][j][i];
}
#pragma endscop
/*
When compiled with:
./ppcg tests/allow-sparse-copy-in.c --no-linearize-device-arrays
--on-error=abort --sizes='{kernel[i]->tile[8,8]; kernel[i]->block[1,8]}'
--max-shared-memory=-1 --unroll-copy-shared
this originally resulted in the following copy-in code:
shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1];
shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1];
shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1];
shared_B[0][3][t1] = B[1][8 * b1 + 3][8 * b0 + t1];
shared_B[0][4][t1] = B[1][8 * b1 + 4][8 * b0 + t1];
shared_B[0][5][t1] = B[1][8 * b1 + 5][8 * b0 + t1];
shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1];
shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1];
whereas we only want to perform copies that are actually needed:
shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1];
shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1];
shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1];
shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1];
shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1];
*/
for (int i = 0; i < 100; ++i)
if (A[1][0][i] != i)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,29 +0,0 @@
#include <stdlib.h>
/* Access summary for copy(): element "pos" of "b" is written and
 * element "pos" of "a" is read.
 */
void copy_summary(int b[1000], int a[1000], int pos)
{
b[pos] = 0;
int c = a[pos];
}
/* copy() is only declared here, with copy_summary attached as its
 * pencil_access attribute when the pencil_access macro is defined.
 */
#ifdef pencil_access
__attribute__((pencil_access(copy_summary)))
#endif
void copy(int b[1000], int a[1000], int pos);
/* Call copy() on every index inside the scop and check afterwards
 * that "b" has become equal to "a".
 */
int main()
{
int a[1000], b[1000];
for (int i = 0; i < 1000; ++i)
a[i] = i;
#pragma scop
for (int i = 0; i < 1000; ++i)
copy(b, a, i);
#pragma endscop
for (int i = 0; i < 1000; ++i)
if (b[i] != a[i])
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,29 +0,0 @@
#include <stdlib.h>
/* Access summary for copy(): element "pos" of "b" is written and
 * element "pos" of "a" is read.
 */
void copy_summary(int b[1000], int a[1000], int pos)
{
b[pos] = 0;
int c = a[pos];
}
/* copy() is only declared here, with copy_summary attached as its
 * pencil_access attribute when the pencil_access macro is defined.
 */
#ifdef pencil_access
__attribute__((pencil_access(copy_summary)))
#endif
void copy(int b[1000], int a[1000], int pos);
/* Same copy test, but source and destination are two rows
 * of a single two-dimensional array: row 0 is copied to row 1.
 */
int main()
{
int a[2][1000];
for (int i = 0; i < 1000; ++i)
a[0][i] = i;
#pragma scop
for (int i = 0; i < 1000; ++i)
copy(a[1], a[0], i);
#pragma endscop
for (int i = 0; i < 1000; ++i)
if (a[1][i] != a[0][i])
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,4 +0,0 @@
/* Copy element "pos" of "a" to element "pos" of "b". */
void copy(__global int b[1000], __global int a[1000], int pos)
{
b[pos] = a[pos];
}

View File

@ -1,32 +0,0 @@
#include <stdlib.h>
/* Access summary for copy(): all 100 elements of "b" are written and
 * all 100 elements of "a" are read.
 */
void copy_summary(int b[100], int a[100])
{
for (int i = 0; i < 100; ++i) {
b[i] = 0;
int c = a[i];
}
}
/* copy() is only declared here, with copy_summary attached as its
 * pencil_access attribute when the pencil_access macro is defined.
 */
#ifdef pencil_access
__attribute__((pencil_access(copy_summary)))
#endif
void copy(int b[100], int a[100]);
/* Copy B into every row of A inside the scop and check afterwards
 * that each A[j][i] equals B[i].
 */
int main()
{
int A[100][100], B[100];
for (int i = 0; i < 100; ++i)
B[i] = i;
#pragma scop
for (int i = 0; i < 100; ++i)
copy(A[i], B);
#pragma endscop
for (int i = 0; i < 100; ++i)
for (int j = 0; j < 100; ++j)
if (A[j][i] != B[i])
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,5 +0,0 @@
/* Copy all 100 elements of "a" to "b". */
void copy(__global int b[100], __global int a[100])
{
for (int i = 0; i < 100; ++i)
b[i] = a[i];
}

View File

@ -1,4 +0,0 @@
/* Copy element "pos" of "a" to element "pos" of "b". */
void copy(__global int b[1000], __global int a[1000], int pos)
{
b[pos] = a[pos];
}

View File

@ -1,23 +0,0 @@
#include <stdlib.h>
/* Copy "a" to "b" inside the scop through the scalar "c", which is
 * declared inside the loop body together with a second scalar "d"
 * that is assigned but not used afterwards.
 */
int main()
{
int a[1000], b[1000];
for (int i = 0; i < 1000; ++i)
a[i] = i;
#pragma scop
for (int i = 0; i < 1000; ++i) {
int c;
int d;
c = a[i];
d = c;
b[i] = c;
}
#pragma endscop
for (int i = 0; i < 1000; ++i)
if (b[i] != a[i])
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,18 +0,0 @@
#include <stdlib.h>
/* Use the loop iterator "i" after the loop, still inside the scop:
 * the statement following the loop writes a[i] with the value "i" has
 * on loop exit, and the check expects a[100] == 100.
 */
int main()
{
int i;
int a[101];
i = 0;
#pragma scop
for (i = 0; i < 100; ++i)
a[i] = i;
a[i] = i;
#pragma endscop
if (a[100] != 100)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,22 +0,0 @@
#include <stdlib.h>
/* Check that a write access is not removed from the live-out
 * accesses only because a strict subset of the (potentially)
 * accessed elements are killed by a later write.
 *
 * Inside the scop, A[i] is written with "i" computed as i * i
 * starting from 1, followed by a write to A[0]; the check after
 * the scop expects A[1] to hold the value written through A[i].
 */
int main()
{
int A[10];
A[1] = 0;
#pragma scop
int i = 1;
i = i * i;
A[i] = 1;
A[0] = 0;
#pragma endscop
if (A[1] != 1)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,22 +0,0 @@
#include <stdlib.h>
/* Fill an array B that is declared inside the scop with 0, 1, ..., 99
 * through a recurrence and copy it to A, which is checked
 * after the scop.
 */
int main()
{
int A[100];
#pragma scop
{
int B[100];
B[0] = 0;
for (int i = 1; i < 100; ++i)
B[i] = B[i - 1] + 1;
for (int i = 0; i < 100; ++i)
A[i] = B[i];
}
#pragma endscop
for (int i = 0; i < 100; ++i)
if (A[i] != i)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,18 +0,0 @@
#include <stdlib.h>
/* Copy "a" to "b" element by element inside the scop and check
 * afterwards that the two arrays are equal.
 */
int main()
{
int a[1000], b[1000];
for (int i = 0; i < 1000; ++i)
a[i] = i;
#pragma scop
for (int i = 0; i < 1000; ++i)
b[i] = a[i];
#pragma endscop
for (int i = 0; i < 1000; ++i)
if (b[i] != a[i])
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,29 +0,0 @@
#include <stdlib.h>
/* Access summary for copy(): element "pos" of "b" is written and
 * element "pos" of "a" is read; "c" is not accessed.
 */
void copy_summary(int b[1000], int a[1000], int pos, int c[1000])
{
b[pos] = 0;
int d = a[pos];
}
/* copy() is only declared here, with copy_summary attached as its
 * pencil_access attribute when the pencil_access macro is defined.
 */
#ifdef pencil_access
__attribute__((pencil_access(copy_summary)))
#endif
void copy(int b[1000], int a[1000], int pos, int c[1000]);
/* Call copy() on every index inside the scop, passing an extra array "c"
 * that is never initialized in this file, and check afterwards
 * that "b" has become equal to "a".
 */
int main()
{
int a[1000], b[1000], c[1000];
for (int i = 0; i < 1000; ++i)
a[i] = i;
#pragma scop
for (int i = 0; i < 1000; ++i)
copy(b, a, i, c);
#pragma endscop
for (int i = 0; i < 1000; ++i)
if (b[i] != a[i])
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,5 +0,0 @@
/* Copy element "pos" of "a" to element "pos" of "b".
 * The array "c" is not accessed.
 */
void copy(__global int b[1000], __global int a[1000], int pos,
__global int c[1000])
{
b[pos] = a[pos];
}

View File

@ -1,13 +0,0 @@
#include <stdlib.h>
/* Scop consisting of a single assignment to a scalar, the value of
 * which is checked after the scop.
 */
int main()
{
int a;
#pragma scop
a = 1;
#pragma endscop
if (a != 1)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,25 +0,0 @@
#include <stdlib.h>
/* Check that the sources of live ranges with the same sink
 * are executed in order.
 *
 * The scalar "set" has two writes (the initialization to 0 and
 * the conditional assignment of 1) that both reach the read in "if (set)".
 * NOTE(review): only A[0] is initialized before the scop, while the loop
 * reads A[i] for all i < 128; only A[0] is checked afterwards.
 */
int main()
{
int A[128];
int n = 128;
A[0] = 0;
#pragma scop
for (int i = 0; i < n; ++i) {
int set = 0;
if (A[i] < 2)
set = 1;
if (set)
A[i] = 2;
}
#pragma endscop
if (A[0] != 2)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,31 +0,0 @@
#include <stdlib.h>
/* Structure with a two-dimensional array member. */
struct s {
int c[10][10];
};
/* Fill the array member of every element of a two-dimensional array
 * of structures inside the scop and compare the result against
 * a reference filled with the same values outside the scop.
 */
int main()
{
struct s a[10][10], b[10][10];
for (int i = 0; i < 10; ++i)
for (int j = 0; j < 10; ++j)
for (int k = 0; k < 10; ++k)
for (int l = 0; l < 10; ++l)
a[i][j].c[k][l] = i + j + k + l;
#pragma scop
for (int i = 0; i < 10; ++i)
for (int j = 0; j < 10; ++j)
for (int k = 0; k < 10; ++k)
for (int l = 0; l < 10; ++l)
b[i][j].c[k][l] = i + j + k + l;
#pragma endscop
for (int i = 0; i < 10; ++i)
for (int j = 0; j < 10; ++j)
for (int k = 0; k < 10; ++k)
for (int l = 0; l < 10; ++l)
if (b[i][j].c[k][l] != a[i][j].c[k][l])
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,21 +0,0 @@
#include <stdlib.h>
/* Structure with a single integer field. */
struct s {
int a;
};
/* Write a field of a scalar structure inside the scop and copy
 * that field to the same field of every element of an array
 * of structures.
 */
int main()
{
struct s a, b[10];
#pragma scop
a.a = 42;
for (int i = 0; i < 10; ++i)
b[i].a = a.a;
#pragma endscop
for (int i = 0; i < 10; ++i)
if (b[i].a != 42)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,25 +0,0 @@
#include <stdlib.h>
/* Structure with two integer fields. */
struct s {
int a;
int b;
};
/* Write one field of a structure inside the scop and then assign
 * the structure as a whole to every element of an array.
 * The other field is set before the scop and is checked to be
 * unchanged afterwards.
 */
int main()
{
struct s a, b[10];
a.b = 57;
#pragma scop
a.a = 42;
for (int i = 0; i < 10; ++i)
b[i] = a;
#pragma endscop
for (int i = 0; i < 10; ++i)
if (b[i].a != 42)
return EXIT_FAILURE;
if (a.b != 57)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,27 +0,0 @@
#include <stdlib.h>
/* Structure with two integer fields. */
struct s {
int a;
int b;
};
/* Use a structure that is declared inside the loop body in the scop:
 * both of its fields are written and then summed into a[i].
 */
int main()
{
int a[10];
for (int i = 0; i < 10; ++i)
a[i] = 0;
#pragma scop
for (int i = 0; i < 10; ++i) {
struct s b;
b.a = 1;
b.b = i;
a[i] = b.a + b.b;
}
#pragma endscop
for (int i = 0; i < 10; ++i)
if (a[i] != 1 + i)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

View File

@ -1,105 +0,0 @@
/*
* Copyright 2012-2013 Ecole Normale Superieure
*
* Use of this software is governed by the MIT license
*
* Written by Sven Verdoolaege,
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
*/
#include <isl/space.h>
#include <isl/val.h>
#include <isl/aff.h>
#include <isl/set.h>
#include "util.h"
/* Create an isl_multi_val in "space" in which every element
 * is equal to "val".
 * NULL is returned if "space" is NULL.
 */
__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
	int val)
{
	int pos, n_dim;
	isl_val *v;
	isl_multi_val *mv;

	if (!space)
		return NULL;

	/* Collect what is needed from "space" before handing it
	 * to isl_multi_val_zero.
	 */
	v = isl_val_int_from_si(isl_space_get_ctx(space), val);
	n_dim = isl_space_dim(space, isl_dim_set);

	mv = isl_multi_val_zero(space);
	for (pos = 0; pos < n_dim; ++pos)
		mv = isl_multi_val_set_val(mv, pos, isl_val_copy(v));
	isl_val_free(v);

	return mv;
}
/* Create an isl_multi_val in "space" whose element at position i
 * is equal to list[i].
 * "list" is assumed to have at least as many entries as "space"
 * has set dimensions.
 * NULL is returned if "space" is NULL.
 */
__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
	__isl_take isl_space *space, int *list)
{
	int pos, n_dim;
	isl_ctx *ctx;
	isl_multi_val *mv;

	if (!space)
		return NULL;

	ctx = isl_space_get_ctx(space);
	n_dim = isl_space_dim(space, isl_dim_set);

	mv = isl_multi_val_zero(space);
	for (pos = 0; pos < n_dim; ++pos)
		mv = isl_multi_val_set_val(mv, pos,
				isl_val_int_from_si(ctx, list[pos]));

	return mv;
}
/* Compute the size of a bounding box around the origin and "set",
 * where "set" is assumed to contain only non-negative elements.
 * In particular, compute the maximal value of "set" in each direction
 * and add one.
 *
 * If some dimension of "set" has no upper bound, then a message is
 * printed to stderr and NULL is returned.  NULL is also returned
 * if the presence of an upper bound cannot be determined.
 * In both cases, the computation stops at the offending dimension.
 */
__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set)
{
	int i, n;
	isl_multi_pw_aff *mpa;

	n = isl_set_dim(set, isl_dim_set);
	mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
	for (i = 0; i < n; ++i) {
		isl_bool bounded;
		isl_space *space;
		isl_aff *one;
		isl_pw_aff *bound;

		/* isl_bool has three states; an error must not be
		 * mistaken for "bounded".
		 */
		bounded = isl_set_dim_has_upper_bound(set, isl_dim_set, i);
		if (bounded < 0)
			goto error;
		if (!bounded) {
			const char *name;

			name = isl_set_get_tuple_name(set);
			if (!name)
				name = "";
			fprintf(stderr, "unable to determine extent of '%s' "
				"in dimension %d\n", name, i);
			goto error;
		}

		/* size in direction i = (maximal value) + 1 */
		bound = isl_set_dim_max(isl_set_copy(set), i);
		space = isl_pw_aff_get_domain_space(bound);
		one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
		one = isl_aff_add_constant_si(one, 1);
		bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
		mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
	}
	isl_set_free(set);

	return mpa;
error:
	isl_set_free(set);
	return isl_multi_pw_aff_free(mpa);
}

View File

@ -1,22 +0,0 @@
#ifndef UTIL_H
#define UTIL_H
#include <string.h>
#include <isl/space.h>
#include <isl/val.h>
/* Compare the first strlen("prefix") characters of "s" to "prefix".
 * The result is that of strncmp: zero if "s" starts with "prefix",
 * and a negative or positive value otherwise.
 */
static inline int prefixcmp(const char *s, const char *prefix)
{
	const size_t prefix_len = strlen(prefix);

	return strncmp(s, prefix, prefix_len);
}
__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
int val);
__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
__isl_take isl_space *space, int *list);
__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set);
#endif

View File

@ -1,6 +0,0 @@
#include "gitversion.h"
const char *ppcg_version(void)
{
return GIT_HEAD_ID"\n";
}

View File

@ -217,14 +217,6 @@ static StaticInitializer InitializeEverything;
void initializePollyPasses(llvm::PassRegistry &Registry) {
initializeCodeGenerationPass(Registry);
#ifdef GPU_CODEGEN
initializePPCGCodeGenerationPass(Registry);
initializeManagedMemoryRewritePassPass(Registry);
LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();
#endif
initializeCodePreparationPass(Registry);
initializeDeadCodeElimWrapperPassPass(Registry);
initializeDependenceInfoPass(Registry);

View File

@ -711,11 +711,6 @@ static void runIslScheduleOptimizer(
function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
isl::schedule &LastSchedule, bool &DepsChanged) {
// Skip SCoPs in case they're already optimised by PPCGCodeGeneration
if (S.isToBeSkipped())
return;
// Skip empty SCoPs but still allow code generation as it will delete the
// loops present but not needed.
if (S.getSize() == 0) {

View File

@ -1,9 +0,0 @@
; Stand-in definitions of @__nv_expf, @__nv_cosf and @__nv_logf.
; Each of them simply returns its argument unchanged.
define float @__nv_expf(float %a) {
ret float %a
}
define float @__nv_cosf(float %a) {
ret float %a
}
define float @__nv_logf(float %a) {
ret float %a
}

View File

@ -1,71 +0,0 @@
; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
; REQUIRES: pollyacc
; Check that we detect a scop.
; SCOP: Function: checkScalarKill
; SCOP-NEXT: Region: %XLoopInit---%for.end
; SCOP-NEXT: Max Loop Depth: 1
; Check that we have a scalar that is not a phi node in the scop.
; SCOP: i32 MemRef_x_0; // Element size 4
; Check that kernel launch is generated in host IR.
; the declare would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
; Check that we add variables that are local to a scop into the kills that we
; pass to PPCG. This should enable PPCG to codegen this example.
; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) {
; int x;
; #pragma scop
; for(int i = 0; i < 1000; i++) {
; XLoopInit: x = 0;
;
; if (control1 > 2)
; C1Add: x += 10;
; if (control2 > 3)
; C2Add: x += A[i];
;
; BLoopAccumX: B[i] += x;
; }
;
; #pragma endscop
; }
; ModuleID = 'test.ll'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; Single loop of 1000 iterations over %indvars.iv.
; %x.0 is 10 if %control1 > 2 and 0 otherwise; if %control2 > 3, block C2Add
; adds A[i] to it. The merged value %x.1 is then added to B[i].
; %C is not used.
define void @checkScalarKill(ptr %A, ptr %B, ptr %C, i32 %control1, i32 %control2) {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %XLoopInit
XLoopInit: ; preds = %entry.split, %BLoopAccumX
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ]
%cmp1 = icmp sgt i32 %control1, 2
%x.0 = select i1 %cmp1, i32 10, i32 0
%cmp2 = icmp sgt i32 %control2, 3
br i1 %cmp2, label %C2Add, label %BLoopAccumX
C2Add: ; preds = %XLoopInit
%arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
%tmp6 = load i32, ptr %arrayidx, align 4
%add4 = add nsw i32 %tmp6, %x.0
br label %BLoopAccumX
BLoopAccumX: ; preds = %XLoopInit, %C2Add
%x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ]
%arrayidx7 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
%tmp11 = load i32, ptr %arrayidx7, align 4
%add8 = add nsw i32 %tmp11, %x.1
store i32 %add8, ptr %arrayidx7, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 1000
br i1 %exitcond, label %XLoopInit, label %for.end
for.end: ; preds = %BLoopAccumX
ret void
}

View File

@ -1,53 +0,0 @@
; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \
; RUN: FileCheck %s
; REQUIRES: pollyacc
; CHECK: polly_launchKernel
; Verify that this program compiles. At some point, this compilation crashed
; due to insufficient parameters being available.
source_filename = "bugpoint-output-4d01492.bc"
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
%struct.barney = type { ptr, i64, i64, [2 x %struct.widget] }
%struct.widget = type { i64, i64, i64 }
@global = external unnamed_addr global %struct.barney, align 32
; Function Attrs: nounwind uwtable
; Outer loop (bb1/bb13) counts %tmp2 upwards from 1 until it equals %tmp,
; the value loaded from %arg on entry.
; The inner block bb3 loads a base pointer and two i64 values from @global,
; combines the latter with %tmp2 into an index and stores undef at that index.
; Its back edge is guarded by the constant comparison "icmp eq i32 0, 0".
define void @wobble(ptr noalias %arg) #0 {
bb:
%tmp = load i32, ptr %arg, align 4
br label %bb1
bb1: ; preds = %bb13, %bb
%tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ]
br label %bb3
bb3: ; preds = %bb3, %bb1
%tmp4 = load ptr, ptr @global, align 32
%tmp5 = sext i32 %tmp2 to i64
%tmp6 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 3, i64 1, i32 0), align 8
%tmp7 = mul i64 %tmp6, %tmp5
%tmp8 = add i64 %tmp7, 0
%tmp9 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 1), align 8
%tmp10 = add i64 %tmp8, %tmp9
%tmp11 = getelementptr i32, ptr %tmp4, i64 %tmp10
store i32 undef, ptr %tmp11, align 4
%tmp12 = icmp eq i32 0, 0
br i1 %tmp12, label %bb13, label %bb3
bb13: ; preds = %bb3
%tmp14 = icmp eq i32 %tmp2, %tmp
%tmp15 = add i32 %tmp2, 1
br i1 %tmp14, label %bb16, label %bb1
bb16: ; preds = %bb13
ret void
}
attributes #0 = { nounwind uwtable }

View File

@ -1,50 +0,0 @@
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-use-llvm-names < %s
; ModuleID = 'test/GPGPU/zero-size-array.ll'
; REQUIRES: pollyacc
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
; We used to divide the element size by 8 to arrive at the 'actual' size
; of an array element. This used to cause arrays that have an element size
; of less than 8 to collapse to size 0. This test makes sure that it does
; not happen anymore.
; f(int *niters_ptr, int *arr[0]) {
; const int niters = *niters_ptr;
; for(int i = 0; i < niters; i++) {
; arr[0][i + 1] = 0
; }
; }
; Function Attrs: nounwind uwtable
; loop.body stores 0 through a [0 x i32] GEP on %arr at index %indvar,
; which runs from 1 until it equals the value loaded from %niters.ptr.
; auxiliary.loop is a second loop whose condition and phi input are undef.
define void @f(ptr noalias %niters.ptr, ptr noalias %arr) #0 {
entry:
%niters = load i32, ptr %niters.ptr, align 4
br label %loop.body
loop.body: ; preds = %loop.body, %entry
%indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ]
%indvar.sext = sext i32 %indvar to i64
%arr.slot = getelementptr [0 x i32], ptr %arr, i64 0, i64 %indvar.sext
store i32 0, ptr %arr.slot, align 4
%tmp8 = icmp eq i32 %indvar, %niters
%indvar.next = add i32 %indvar, 1
br i1 %tmp8, label %loop.exit, label %loop.body
loop.exit: ; preds = %loop.body
%tmp10 = icmp sgt i32 undef, 0
br label %auxiliary.loop
auxiliary.loop: ; preds = %"101", %loop.exit
%tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ]
br i1 undef, label %auxiliary.loop, label %exit
exit: ; preds = %auxiliary.loop
ret void
}
attributes #0 = { nounwind uwtable }

View File

@ -1,55 +0,0 @@
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-ignore-parameter-bounds \
; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=HOST-IR
;
; REQUIRES: pollyacc
; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain
; all the parameters present in the program.
;
; The construction of the `isl_multi_pw_aff` requires all the individual `pw_aff`
; to have the same parameter dimensions. To achieve this, we used to realign
; every `pw_aff` with `Scop::Context`. However, in conjunction with
; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context`
; does not contain all parameters.
;
; We check that Polly does the right thing in this case and sets up the parameter
; dimensions correctly.
; Check that kernel launch is generated in host IR.
; The declaration would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll'
; C pseudocode
; ------------
; void f(int *arr, long niters, long stride) {
; for(int i = 0; i < niters; i++) {
; arr[i * stride] = 1;
; }
; }
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind uwtable
; @f writes the constant 1 to %arr[%indvar * %stride] for an i64 counter that
; starts at 0. The loop is bottom-tested, so the store executes at least once;
; it exits as soon as the incremented counter is greater than %niters.
define void @f(ptr %arr, i64 %niters, i64 %stride) unnamed_addr #1 {
entry:
br label %loop
loop: ; preds = %loop, %entry
%indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]
; The element index is the product of the counter and the %stride argument.
%idx = mul nuw nsw i64 %indvar, %stride
%slot = getelementptr i32, ptr %arr, i64 %idx
store i32 1, ptr %slot, align 4
%indvar.next = add nuw nsw i64 %indvar, 1
; Signed comparison of the already-incremented counter against %niters.
%check = icmp sgt i64 %indvar.next, %niters
br i1 %check, label %exit, label %loop
exit: ; preds = %loop
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind uwtable }

View File

@ -1,37 +0,0 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=KERNEL %s
; REQUIRES: pollyacc
; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) #0 {
; KERNEL: !nvvm.annotations = !{!0}
; KERNEL: !0 = !{ptr @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; @foo adds 100 to each i64 element A[0] .. A[%n - 1] in place. The loop is
; top-tested in %bb1, so no element is touched when %n is not positive.
define void @foo(ptr %A, i64 %n) {
bb:
br label %bb1
bb1: ; preds = %bb6, %bb
; Loop header; %i.0 counts upward from 0 and is compared (signed) against %n.
%i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
%tmp = icmp slt i64 %i.0, %n
br i1 %tmp, label %bb2, label %bb8
bb2: ; preds = %bb1
; Read-modify-write of A[%i.0] through a single address computation.
%tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0
%tmp4 = load i64, ptr %tmp3, align 8
%tmp5 = add nsw i64 %tmp4, 100
store i64 %tmp5, ptr %tmp3, align 8
br label %bb6
bb6: ; preds = %bb2
; Latch block that only increments the counter.
%tmp7 = add nuw nsw i64 %i.0, 1
br label %bb1
bb8: ; preds = %bb1
ret void
}

View File

@ -1,118 +0,0 @@
; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
; RUN: FileCheck %s
; REQUIRES: pollyacc
;
; #include <cuda_runtime.h>
;
; static const int N = 45;
;
; void copy(int *R, int *A) {
; for (int i = 0; i < N; i++) {
; R[i] = A[i] * 10;
; }
; }
;
; int main() {
; int *A, *R;
;
; cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
; cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
;
; for (int i = 0; i < N; i++) {
; A[i] = i;
; R[i] = 0;
; }
; copy(R, A);
;
; return 0;
; }
;
; CHECK-NOT: polly_copyFromHostToDevice
; CHECK-NOT: polly_copyFromDeviceToHost
; CHECK-NOT: polly_freeDeviceMemory
; CHECK-NOT: polly_allocateMemoryForDevice
; CHECK: %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA()
; CHECK-NEXT: %[[REGCA:[0-9]+]] = bitcast i32* %A to i8*
; CHECK-NEXT: %[[REGCR:[0-9]+]] = bitcast i32* %R to i8*
; CHECK-NEXT: %[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; CHECK-NEXT: store i8* %[[REGCA]], i8** %polly_launch_0_param_0
; CHECK-NEXT: %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8*
; CHECK-NEXT: store i8* %[[REGCP0]], i8** %[[REGGEP0]]
; CHECK-NEXT: %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
; CHECK-NEXT: store i8* %[[REGCR]], i8** %polly_launch_0_param_1
; CHECK-NEXT: %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8*
; CHECK-NEXT: store i8* %[[REGCP1]], i8** %[[REGGEP1]]
; CHECK-NEXT: %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([863 x i8], [863 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
; CHECK-NEXT: call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
; CHECK-NEXT: call void @polly_freeKernel(i8* %[[REGKERNEL]])
; CHECK-NEXT: call void @polly_synchronizeDevice()
; CHECK-NEXT: call void @polly_freeContext(i8* %[[REGCTX]])
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; @copy computes R[i] = A[i] * 10 for i = 0 .. 44 (the trip count 45 is a
; constant in the exit comparison). Pointers use the typed i32* syntax.
define void @copy(i32* %R, i32* %A) {
entry:
br label %for.cond
for.cond: ; preds = %for.inc, %entry
; Loop header; leaves the loop once the counter reaches 45.
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
%exitcond = icmp ne i64 %indvars.iv, 45
br i1 %exitcond, label %for.body, label %for.end
for.body: ; preds = %for.cond
; Load A[i], scale by 10, store the product to R[i].
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx, align 4
%mul = mul nsw i32 %tmp, 10
%arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
store i32 %mul, i32* %arrayidx2, align 4
br label %for.inc
for.inc: ; preds = %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; @main obtains two buffers through @cudaMallocManaged, fills them in a
; 45-iteration loop (A[i] = i, R[i] = 0), calls @copy(R, A) and returns 0.
; The results of both allocation calls (%call, %call1) are never inspected.
define i32 @main() {
entry:
; Stack slots that receive the two buffer pointers.
%A = alloca i32*, align 8
%R = alloca i32*, align 8
; Each request is for 180 bytes with a trailing i32 1 argument; per the C
; source quoted in the file header that is sizeof(int) * N with N = 45 and the
; cudaMemAttachGlobal flag.
; NOTE(review): attribute group #2 is referenced below but no definition is
; visible in this excerpt; confirm the file still parses.
%tmp = bitcast i32** %A to i8**
%call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
%tmp1 = bitcast i32** %R to i8**
%call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
%exitcond = icmp ne i64 %indvars.iv, 45
br i1 %exitcond, label %for.body, label %for.end
for.body: ; preds = %for.cond
; The buffer pointers are reloaded from their stack slots on every iteration.
%tmp2 = load i32*, i32** %A, align 8
%arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
; A[i] receives the counter truncated to 32 bits.
%tmp3 = trunc i64 %indvars.iv to i32
store i32 %tmp3, i32* %arrayidx, align 4
%tmp4 = load i32*, i32** %R, align 8
%arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
store i32 0, i32* %arrayidx3, align 4
br label %for.inc
for.inc: ; preds = %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
br label %for.cond
for.end: ; preds = %for.cond
; Argument order matches @copy's signature; R (destination) first, then A.
%tmp5 = load i32*, i32** %R, align 8
%tmp6 = load i32*, i32** %A, align 8
call void @copy(i32* %tmp5, i32* %tmp6)
ret i32 0
}
declare i32 @cudaMallocManaged(i8**, i64, i32) #1

View File

@ -1,104 +0,0 @@
; RUN: opt %loadPolly %s -polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: | FileCheck --check-prefix=KERNEL-IR %s
; REQUIRES: pollyacc
; KERNEL-IR: define ptx_kernel void @FUNC_vec_add_1_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arr, i32 %N) #0 {
; The instruction marked <<<LeakyInst>>> is copied into the GPUModule,
; with changes only to the parameters to access data on the device instead of
; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the
; instruction is annotated with a DILocation, copying the instruction also copies
; the metadata into the GPUModule. This stops codegenerating the ptx_kernel by
; failing the verification of the Module in GPUNodeBuilder::finalize, due to the
; copied DICompileUnit not being listed in a llvm.dbg.cu which was neither copied
; nor created.
;
; https://reviews.llvm.org/D35630 removes this debug metadata before the
; instruction is copied to the GPUModule.
;
; vec_add_1.c:
; void vec_add_1(int N, int arr[N]) {
; int i=0;
; for( i=0 ; i<N ; i++) arr[i] += 1;
; }
;
source_filename = "vec_add_1.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; @vec_add_1 increments arr[0] .. arr[N - 1] by 1 in place. Every instruction
; carries a !dbg location and the function is attached to subprogram !7, so the
; loop body holds debug metadata that refers back to the compile unit.
define void @vec_add_1(i32 %N, ptr %arr) !dbg !7 {
entry:
; Debug-value intrinsics in the four-operand form declared later in this file.
call void @llvm.dbg.value(metadata i32 %N, i64 0, metadata !13, metadata !16), !dbg !17
call void @llvm.dbg.value(metadata ptr %arr, i64 0, metadata !14, metadata !16), !dbg !18
call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
; The bound is widened once so the loop can compare 64-bit values.
%tmp = sext i32 %N to i64, !dbg !20
br label %for.cond, !dbg !20
for.cond: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !15, metadata !16), !dbg !19
; Top-tested loop; the body is skipped entirely when N is not positive.
%cmp = icmp slt i64 %indvars.iv, %tmp, !dbg !22
br i1 %cmp, label %for.body, label %for.end, !dbg !24
for.body: ; preds = %for.cond
; Read-modify-write of arr[i]; the load and store also carry !tbaa metadata.
%arrayidx = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv, !dbg !25
%tmp1 = load i32, ptr %arrayidx, align 4, !dbg !26, !tbaa !27
%add = add nsw i32 %tmp1, 1, !dbg !26 ; <<<LeakyInst>>>
store i32 %add, ptr %arrayidx, align 4, !dbg !26, !tbaa !27
br label %for.inc, !dbg !25
for.inc: ; preds = %for.body
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31
call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19
; The back edge carries the !llvm.loop annotation !33.
br label %for.cond, !dbg !32, !llvm.loop !33
for.end: ; preds = %for.cond
ret void, !dbg !35
}
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !{i32 1, !"wchar_size", i32 4}
!6 = !{!"clang version 5.0.0"}
!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!8 = !DISubroutineType(types: !9)
!9 = !{null, !10, !11}
!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
!12 = !{!13, !14, !15}
!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10)
!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11)
!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10)
!16 = !DIExpression()
!17 = !DILocation(line: 1, column: 20, scope: !7)
!18 = !DILocation(line: 1, column: 27, scope: !7)
!19 = !DILocation(line: 2, column: 7, scope: !7)
!20 = !DILocation(line: 3, column: 8, scope: !21)
!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3)
!22 = !DILocation(line: 3, column: 15, scope: !23)
!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3)
!24 = !DILocation(line: 3, column: 3, scope: !21)
!25 = !DILocation(line: 3, column: 25, scope: !23)
!26 = !DILocation(line: 3, column: 32, scope: !23)
!27 = !{!28, !28, i64 0}
!28 = !{!"int", !29, i64 0}
!29 = !{!"omnipotent char", !30, i64 0}
!30 = !{!"Simple C/C++ TBAA"}
!31 = !DILocation(line: 3, column: 21, scope: !23)
!32 = !DILocation(line: 3, column: 3, scope: !23)
!33 = distinct !{!33, !24, !34}
!34 = !DILocation(line: 3, column: 35, scope: !21)
!35 = !DILocation(line: 4, column: 1, scope: !7)

View File

@ -1,254 +0,0 @@
; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=SCHED %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
; RUN: FileCheck %s -check-prefix=IR
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: FileCheck %s -check-prefix=KERNEL-IR
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \
; RUN: -disable-output < %s | \
; RUN: FileCheck %s -check-prefix=KERNEL-ASM
; XFAIL: *
; REQUIRES: pollyacc, target=nvptx{{.*}}
; This fails today due to extensive output differences from when the test was written.
; CHECK: Stmt_bb5
; CHECK-NEXT: Domain :=
; CHECK-NEXT: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 };
; CHECK-NEXT: Schedule :=
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> [i0, i1] };
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }"
; SCHED-NEXT: child:
; SCHED-NEXT: context: "{ [] }"
; SCHED-NEXT: child:
; SCHED-NEXT: extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: sequence:
; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: set:
; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: guard: "{ [] }"
; SCHED-NEXT: - filter: "{ Stmt_bb5[i0, i1] }"
; SCHED-NEXT: child:
; SCHED-NEXT: guard: "{ [] }"
; SCHED-NEXT: child:
; SCHED-NEXT: mark: "kernel"
; SCHED-NEXT: child:
; SCHED-NEXT: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
; SCHED-NEXT: child:
; SCHED-NEXT: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }"
; SCHED-NEXT: child:
; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]"
; SCHED-NEXT: permutable: 1
; SCHED-NEXT: coincident: [ 1, 1 ]
; SCHED-NEXT: child:
; SCHED-NEXT: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
; SCHED-NEXT: child:
; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]"
; SCHED-NEXT: permutable: 1
; SCHED-NEXT: coincident: [ 1, 1 ]
; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: set:
; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }"
; SCHED-NEXT: child:
; SCHED-NEXT: guard: "{ [] }"
; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(16, 32);
; CODE-NEXT: dim3 k0_dimGrid(32, 32);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: }
; CODE: # kernel0
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
; IR: polly.split_new_and_old:
; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1
; IR-NEXT: %polly.overflow.state = or i1 false, %.obit
; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0
; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1
; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0
; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1
; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0
; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1
; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0
; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440
; IR-NEXT: %5 = and i1 true, %4
; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown
; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2
; IR: polly.start:
; IR-NEXT: br label %polly.acc.initialize
; IR: polly.acc.initialize:
; IR-NEXT: [[GPUContext:%.*]] = call ptr @polly_initContext()
; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 4194304)
; IR-NEXT: call void @polly_copyFromHostToDevice(ptr %A, ptr %p_dev_array_MemRef_A, i64 4194304)
; IR-NEXT: [[DevPtr:%.*]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A)
; IR-NEXT: store ptr [[DevPtr]], ptr %polly_launch_0_param_0
; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params
; IR-NEXT: call ptr @polly_getKernel
; IR-NEXT: call void @polly_launchKernel(ptr %11, i32 32, i32 32, i32 32, i32 16, i32 1, ptr %polly_launch_0_params_i8ptr)
; IR-NEXT: call void @polly_freeKernel
; IR-NEXT: call void @polly_copyFromDeviceToHost(ptr %p_dev_array_MemRef_A, ptr %A, i64 4194304)
; IR-NEXT: call void @polly_freeDeviceMemory(ptr %p_dev_array_MemRef_A)
; IR-NEXT: call void @polly_freeContext(ptr [[GPUContext]])
; IR-NEXT: br label %polly.exiting
; IR: polly.exiting:
; IR-NEXT: br label %polly.merge_new_and_old
; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(ptr %MemRef_A) #0 {
; KERNEL-IR-NEXT: entry:
; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64
; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
; KERNEL-IR-NEXT: %b1 = zext i32 %1 to i64
; KERNEL-IR-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; KERNEL-IR-NEXT: %t0 = zext i32 %2 to i64
; KERNEL-IR-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
; KERNEL-IR-NEXT: %t1 = zext i32 %3 to i64
; KERNEL-IR-NEXT: br label %polly.loop_preheader
; KERNEL-IR-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5
; KERNEL-IR-NEXT: ret void
; KERNEL-IR-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader
; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0
; KERNEL-IR-NEXT: %6 = mul nsw i64 32, %b1
; KERNEL-IR-NEXT: %7 = add nsw i64 %6, %t1
; KERNEL-IR-NEXT: %8 = mul nsw i64 16, %polly.indvar
; KERNEL-IR-NEXT: %9 = add nsw i64 %7, %8
; KERNEL-IR-NEXT: br label %polly.stmt.bb5
; KERNEL-IR-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header
; KERNEL-IR-NEXT: %10 = mul i64 %5, %9
; KERNEL-IR-NEXT: %p_tmp6 = sitofp i64 %10 to float
; KERNEL-IR-NEXT: %11 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %12 = add nsw i64 %11, %t0
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024
; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b1
; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t1
; KERNEL-IR-NEXT: %15 = mul nsw i64 16, %polly.indvar
; KERNEL-IR-NEXT: %16 = add nsw i64 %14, %15
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16
; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A
; KERNEL-IR-NEXT: %tmp8_p_scalar_ = load float, ptr %polly.access.MemRef_A, align 4
; KERNEL-IR-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024
; KERNEL-IR-NEXT: %19 = mul nsw i64 32, %b1
; KERNEL-IR-NEXT: %20 = add nsw i64 %19, %t1
; KERNEL-IR-NEXT: %21 = mul nsw i64 16, %polly.indvar
; KERNEL-IR-NEXT: %22 = add nsw i64 %20, %21
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22
; KERNEL-IR-NEXT: %polly.access.MemRef_A4 = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A3
; KERNEL-IR-NEXT: store float %p_tmp9, ptr %polly.access.MemRef_A4, align 4
; KERNEL-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
; KERNEL-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 0
; KERNEL-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
; KERNEL-IR-LABEL: polly.loop_preheader: ; preds = %entry
; KERNEL-IR-NEXT: br label %polly.loop_header
; KERNEL-IR: attributes #0 = { "polly.skip.fn" }
; KERNEL-ASM: .version 3.2
; KERNEL-ASM-NEXT: .target sm_30
; KERNEL-ASM-NEXT: .address_size 64
; KERNEL-ASM: // .globl kernel_0
; KERNEL-ASM: .visible .entry kernel_0(
; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0
; KERNEL-ASM-NEXT: )
; void double_parallel_loop(float A[][1024]) {
; for (long i = 0; i < 1024; i++)
; for (long j = 0; j < 1024; j++)
; A[i][j] += i * j;
; }
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; @double_parallel_loop performs A[i][j] += i * j over a 1024 x 1024 iteration
; space, where each row of A is a [1024 x float] array. Both loops are
; top-tested against the constant 1024.
define void @double_parallel_loop(ptr %A) {
bb:
br label %bb2
bb2: ; preds = %bb13, %bb
; Outer loop header (counter %i.0).
%i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
%exitcond1 = icmp ne i64 %i.0, 1024
br i1 %exitcond1, label %bb3, label %bb15
bb3: ; preds = %bb2
br label %bb4
bb4: ; preds = %bb10, %bb3
; Inner loop header (counter %j.0).
%j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
%exitcond = icmp ne i64 %j.0, 1024
br i1 %exitcond, label %bb5, label %bb12
bb5: ; preds = %bb4
; The only block that touches memory; the integer product i * j is converted
; to float and added to the element loaded from A[i][j].
%tmp = mul nuw nsw i64 %i.0, %j.0
%tmp6 = sitofp i64 %tmp to float
%tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0
%tmp8 = load float, ptr %tmp7, align 4
%tmp9 = fadd float %tmp8, %tmp6
store float %tmp9, ptr %tmp7, align 4
br label %bb10
bb10: ; preds = %bb5
; Inner latch.
%tmp11 = add nuw nsw i64 %j.0, 1
br label %bb4
bb12: ; preds = %bb4
br label %bb13
bb13: ; preds = %bb12
; Outer latch.
%tmp14 = add nuw nsw i64 %i.0, 1
br label %bb2
bb15: ; preds = %bb2
ret void
}

View File

@ -1,57 +0,0 @@
; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS
; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
; REQUIRES: pollyacc
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
%S = type { i32, i32, [12 x %L] }
%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
; @test walks a %S structure reached through %cpi. When %b is true and the
; first i32 field is positive, it zeroes the first field of the nested %L
; records at indices phi * %l1, reloading the first i32 field as the loop bound
; on every iteration.
define void @test(ptr %cpi, i1 %b) {
; SCOPS-LABEL: Region: %if.then14---%exit
; SCOPS: Invariant Accesses: {
; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] };
; SCOPS-NEXT: Execution Context: [l2, l1] -> { : }
; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] };
; SCOPS-NEXT: Execution Context: [l2, l1] -> { : l2 > 0 }
; SCOPS-NEXT: }
; SCOPS: Arrays {
; SCOPS-NEXT: i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4
; SCOPS-NEXT: }
; Check that we gracefully handle failing invariant loads.
; This test case is taken from:
; test/Isl/CodeGen/invariant-load-dimension.ll
; FIXME: Figure out how to actually generate code for this loop.
; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function
entry:
; Address of the second i32 field of %S; it is only dereferenced later, in
; for.body.lr.ph.i.
%nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
br i1 %b, label %if.then14, label %exit
if.then14:
; %cpi itself is used as the address of the first i32 field.
%l0 = load i32, ptr %cpi, align 8
%cmp12.i = icmp sgt i32 %l0, 0
br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
for.body.lr.ph.i:
; Loaded once in the preheader and used as the index multiplier in the loop.
%l1 = load i32, ptr %nt, align 4
br label %for.body.i
for.body.i:
%phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
%mul.i163 = mul nsw i32 %phi, %l1
; Selects field 2 of %S (the [12 x %L] array), element %mul.i163, field 0.
%cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
store i32 0, ptr %cv, align 8
%inc = add nuw nsw i32 %phi, 1
; The bound is re-read from the same address as %l0 on every iteration.
%l2 = load i32, ptr %cpi, align 8
%cmp.i164 = icmp slt i32 %inc, %l2
br i1 %cmp.i164, label %for.body.i, label %exit
exit:
ret void
}

View File

@ -1,41 +0,0 @@
; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
; REQUIRES: pollyacc
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
%S = type { i32, i32, [12 x %L] }
%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
; @test walks a %S structure reached through %cpi. When %b is true and the
; first i32 field is positive, it zeroes the first field of the nested %L
; records at indices phi * %l1, reloading the first i32 field as the loop bound
; on every iteration.
define void @test(ptr %cpi, i1 %b) {
; CODEGEN-LABEL: @test(
; CODEGEN: polly.preload.begin:
; CODEGEN-NEXT: br i1 false
entry:
; Address of the second i32 field of %S; it is only dereferenced later, in
; for.body.lr.ph.i.
%nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
br i1 %b, label %if.then14, label %exit
if.then14:
; %cpi itself is used as the address of the first i32 field.
%l0 = load i32, ptr %cpi, align 8
%cmp12.i = icmp sgt i32 %l0, 0
br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
for.body.lr.ph.i:
; Loaded once in the preheader and used as the index multiplier in the loop.
%l1 = load i32, ptr %nt, align 4
br label %for.body.i
for.body.i:
%phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
%mul.i163 = mul nsw i32 %phi, %l1
; Selects field 2 of %S (the [12 x %L] array), element %mul.i163, field 0.
%cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
store i32 0, ptr %cv, align 8
%inc = add nuw nsw i32 %phi, 1
; The bound is re-read from the same address as %l0 on every iteration.
%l2 = load i32, ptr %cpi, align 8
%cmp.i164 = icmp slt i32 %inc, %l2
br i1 %cmp.i164, label %for.body.i, label %exit
exit:
ret void
}

View File

@ -1,176 +0,0 @@
; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR
; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
; RUN: -S < %s | FileCheck %s -check-prefix=IR
; void foo(float A[2][100]) {
; for (long t = 0; t < 100; t++)
; for (long i = 1; i < 99; i++)
; A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1];
; }
; REQUIRES: pollyacc
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(4);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, c0);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
; CODE-NEXT: }
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
; ...
; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1
; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]
; IR: call i8* @polly_getKernel
; ...
; IR: call void @polly_freeKernel
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0)
; KERNEL-IR-LABEL: entry:
; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64
; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; KERNEL-IR-NEXT: %t0 = zext i32 %1 to i64
; KERNEL-IR-NEXT: br label %polly.cond
; KERNEL-IR-LABEL: polly.cond: ; preds = %entry
; KERNEL-IR-NEXT: %2 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %3 = add nsw i64 %2, %t0
; KERNEL-IR-NEXT: %4 = icmp sle i64 %3, 97
; KERNEL-IR-NEXT: br i1 %4, label %polly.then, label %polly.else
; KERNEL-IR-LABEL: polly.merge: ; preds = %polly.else, %polly.stmt.for.body3
; KERNEL-IR-NEXT: ret void
; KERNEL-IR-LABEL: polly.then: ; preds = %polly.cond
; KERNEL-IR-NEXT: %5 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %6 = add nsw i64 %5, %t0
; KERNEL-IR-NEXT: br label %polly.stmt.for.body3
; KERNEL-IR-LABEL: polly.stmt.for.body3: ; preds = %polly.then
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %pexp.pdiv_r = urem i64 %c0, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100
; KERNEL-IR-NEXT: %7 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %8 = add nsw i64 %7, %t0
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8
; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
; KERNEL-IR-NEXT: %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %pexp.pdiv_r2 = urem i64 %c0, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100
; KERNEL-IR-NEXT: %9 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %10 = add nsw i64 %9, %t0
; KERNEL-IR-NEXT: %11 = add nsw i64 %10, 1
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11
; KERNEL-IR-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4
; KERNEL-IR-NEXT: %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4
; KERNEL-IR-NEXT: %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %pexp.pdiv_r7 = urem i64 %c0, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100
; KERNEL-IR-NEXT: %12 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %13 = add nsw i64 %12, %t0
; KERNEL-IR-NEXT: %14 = add nsw i64 %13, 2
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14
; KERNEL-IR-NEXT: %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9
; KERNEL-IR-NEXT: %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4
; KERNEL-IR-NEXT: %p_add12 = fadd float %p_add, %tmp3_p_scalar_
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %15 = add nsw i64 %c0, 1
; KERNEL-IR-NEXT: %pexp.pdiv_r12 = urem i64 %15, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100
; KERNEL-IR-NEXT: %16 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %17 = add nsw i64 %16, %t0
; KERNEL-IR-NEXT: %18 = add nsw i64 %17, 1
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18
; KERNEL-IR-NEXT: %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14
; KERNEL-IR-NEXT: %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4
; KERNEL-IR-NEXT: %p_add17 = fadd float %tmp4_p_scalar_, %p_add12
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
; KERNEL-IR-NEXT: %19 = add nsw i64 %c0, 1
; KERNEL-IR-NEXT: %pexp.pdiv_r17 = urem i64 %19, 2
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100
; KERNEL-IR-NEXT: %20 = mul nsw i64 32, %b0
; KERNEL-IR-NEXT: %21 = add nsw i64 %20, %t0
; KERNEL-IR-NEXT: %22 = add nsw i64 %21, 1
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22
; KERNEL-IR-NEXT: %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19
; KERNEL-IR-NEXT: store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4
; KERNEL-IR-NEXT: br label %polly.merge
; KERNEL-IR-LABEL: polly.else: ; preds = %polly.cond
; KERNEL-IR-NEXT: br label %polly.merge
; KERNEL-IR-NEXT: }
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; @foo is a two-row stencil over A (rows of [100 x float]). For t = 0 .. 99 and
; i = 1 .. 98 it reads A[t % 2][i - 1], A[t % 2][i] and A[t % 2][i + 1], adds
; their sum to A[(t + 1) % 2][i] and stores the result back to that element.
; Pointers use the typed syntax.
define void @foo([100 x float]* %A) {
entry:
br label %for.cond
for.cond: ; preds = %for.inc18, %entry
; Outer (time) loop header; runs while %t.0 differs from 100.
%t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]
%exitcond1 = icmp ne i64 %t.0, 100
br i1 %exitcond1, label %for.body, label %for.end20
for.body: ; preds = %for.cond
br label %for.cond1
for.cond1: ; preds = %for.inc, %for.body
; Inner loop header; %i.0 starts at 1 and stops at 99, so the i - 1 and i + 1
; neighbours stay inside columns 0 .. 99.
%i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ]
%exitcond = icmp ne i64 %i.0, 99
br i1 %exitcond, label %for.body3, label %for.end
for.body3: ; preds = %for.cond1
; Left neighbour A[t % 2][i - 1]. The row index is recomputed with srem for
; every access rather than shared.
%sub = add nsw i64 %i.0, -1
%rem = srem i64 %t.0, 2
%arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub
%tmp = load float, float* %arrayidx4, align 4
; Centre element A[t % 2][i].
%rem5 = srem i64 %t.0, 2
%arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0
%tmp2 = load float, float* %arrayidx7, align 4
%add = fadd float %tmp, %tmp2
; Right neighbour A[t % 2][i + 1].
%add8 = add nuw nsw i64 %i.0, 1
%rem9 = srem i64 %t.0, 2
%arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8
%tmp3 = load float, float* %arrayidx11, align 4
%add12 = fadd float %add, %tmp3
; Destination A[(t + 1) % 2][i] is loaded, accumulated and written back
; through the same address.
%add13 = add nuw nsw i64 %t.0, 1
%rem14 = srem i64 %add13, 2
%arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0
%tmp4 = load float, float* %arrayidx16, align 4
%add17 = fadd float %tmp4, %add12
store float %add17, float* %arrayidx16, align 4
br label %for.inc
for.inc: ; preds = %for.body3
%inc = add nuw nsw i64 %i.0, 1
br label %for.cond1
for.end: ; preds = %for.cond1
br label %for.inc18
for.inc18: ; preds = %for.end
%inc19 = add nuw nsw i64 %t.0, 1
br label %for.cond
for.end20: ; preds = %for.cond
ret void
}

View File

@ -1,204 +0,0 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -polly-invariant-load-hoisting=false \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -polly-invariant-load-hoisting=false \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=KERNEL-IR %s
; REQUIRES: pollyacc
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
declare void @llvm.lifetime.start(i64, ptr nocapture) #0
; This test case tests that we can correctly handle a ScopStmt that is
; scheduled on the host, instead of within a kernel.
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(16);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: if (p_0 <= 510 && p_1 <= 510) {
; CODE-NEXT: {
; CODE-NEXT: dim3 k1_dimBlock(32);
; CODE-NEXT: dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: {
; CODE-NEXT: dim3 k2_dimBlock(16, 32);
; CODE-NEXT: dim3 k2_dimGrid(16, p_1 <= -7650 ? 256 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: }
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
; CODE-NEXT: Stmt_for_cond33_preheader_last();
; CODE: }
; CODE: # kernel0
; CODE-NEXT: Stmt_for_body16(32 * b0 + t0);
; CODE: # kernel1
; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1)
; CODE-NEXT: for (int c1 = 0; c1 <= 15; c1 += 1) {
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0)
; CODE-NEXT: Stmt_for_body35(32 * b0 + t0 + 1048576 * c0);
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510)
; CODE-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1)
; CODE-NEXT: Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3);
; CODE-NEXT: sync0();
; CODE-NEXT: }
; CODE: # kernel2
; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1)
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510)
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
; CODE-NEXT: Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3);
; KERNEL-IR: call void @llvm.nvvm.barrier0()
; Function Attrs: nounwind uwtable
; Loop nest over the arrays %A, %R and %Q, each indexed as rows of 512 doubles.
; The arguments %ni and %nj are not used inside the body; all trip counts are
; the constant 512.
;
; For every outer index k (%indvars.iv24, 0..511) the function
;   1. accumulates nrm = sum over i of A[i][k] * A[i][k]        (for.inc)
;   2. stores R[k][k] = sqrt(nrm)                               (for.end)
;   3. stores Q[i][k] = A[i][k] / R[k][k] for i in 0..511       (for.body16)
;   4. for every j starting at %indvars.iv19 (which is k + 1) up to 511
;        R[k][j] = 0                                            (for.body35)
;        R[k][j] += Q[i][k] * A[i][j] for i in 0..511           (for.body42)
;        A[i][j] -= Q[i][k] * R[k][j] for i in 0..511           (for.body62)
define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, ptr %A, ptr %R, ptr %Q) #1 {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %for.cond1.preheader
; Outer loop header. %indvars.iv24 is k; %indvars.iv19 starts at 1 and is
; advanced together with k, so it always holds k + 1.
for.cond1.preheader: ; preds = %entry.split, %for.inc86
%indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ]
%indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ]
br label %for.inc
; Step 1 - sum of squares of column k of A, carried in %nrm.02.
for.inc: ; preds = %for.cond1.preheader, %for.inc
%indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
%nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ]
%arrayidx5 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
%tmp = load double, ptr %arrayidx5, align 8, !tbaa !1
%arrayidx9 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
%tmp27 = load double, ptr %arrayidx9, align 8, !tbaa !1
%mul = fmul double %tmp, %tmp27
%add = fadd double %nrm.02, %mul
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp ne i64 %indvars.iv.next, 512
br i1 %exitcond, label %for.inc, label %for.end
; Step 2 - R[k][k] = sqrt(sum of squares).
for.end: ; preds = %for.inc
%add.lcssa = phi double [ %add, %for.inc ]
%call = tail call double @sqrt(double %add.lcssa) #2
%arrayidx13 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24
store double %call, ptr %arrayidx13, align 8, !tbaa !1
br label %for.body16
; Reached after step 3; the j loop is skipped when k + 1 is not below 512.
for.cond33.preheader: ; preds = %for.body16
%indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
%cmp347 = icmp slt i64 %indvars.iv.next25, 512
br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86
for.body35.lr.ph: ; preds = %for.cond33.preheader
br label %for.body35
; Step 3 - Q[i][k] = A[i][k] / R[k][k].
for.body16: ; preds = %for.end, %for.body16
%indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ]
%arrayidx20 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv10, i64 %indvars.iv24
%tmp28 = load double, ptr %arrayidx20, align 8, !tbaa !1
%arrayidx24 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24
%tmp29 = load double, ptr %arrayidx24, align 8, !tbaa !1
%div = fdiv double %tmp28, %tmp29
%arrayidx28 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv10, i64 %indvars.iv24
store double %div, ptr %arrayidx28, align 8, !tbaa !1
%indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
%exitcond12 = icmp ne i64 %indvars.iv.next11, 512
br i1 %exitcond12, label %for.body16, label %for.cond33.preheader
; Latch of the j loop - the exit test is done on the value truncated to i32.
for.cond33.loopexit: ; preds = %for.body62
%indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
%lftr.wideiv = trunc i64 %indvars.iv.next22 to i32
%exitcond23 = icmp ne i32 %lftr.wideiv, 512
br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge
; Step 4 - header of the j loop (%indvars.iv21 is j); clears R[k][j].
for.body35: ; preds = %for.body35.lr.ph, %for.cond33.loopexit
%indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ]
%arrayidx39 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
store double 0.000000e+00, ptr %arrayidx39, align 8, !tbaa !1
br label %for.body42
for.cond60.preheader: ; preds = %for.body42
br label %for.body62
; R[k][j] += Q[i][k] * A[i][j], accumulated through memory on every iteration.
for.body42: ; preds = %for.body35, %for.body42
%indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ]
%arrayidx46 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv13, i64 %indvars.iv24
%tmp30 = load double, ptr %arrayidx46, align 8, !tbaa !1
%arrayidx50 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv13, i64 %indvars.iv21
%tmp31 = load double, ptr %arrayidx50, align 8, !tbaa !1
%mul51 = fmul double %tmp30, %tmp31
%arrayidx55 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
%tmp32 = load double, ptr %arrayidx55, align 8, !tbaa !1
%add56 = fadd double %tmp32, %mul51
store double %add56, ptr %arrayidx55, align 8, !tbaa !1
%indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
%exitcond15 = icmp ne i64 %indvars.iv.next14, 512
br i1 %exitcond15, label %for.body42, label %for.cond60.preheader
; A[i][j] = A[i][j] - Q[i][k] * R[k][j].
for.body62: ; preds = %for.cond60.preheader, %for.body62
%indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ]
%arrayidx66 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21
%tmp33 = load double, ptr %arrayidx66, align 8, !tbaa !1
%arrayidx70 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv16, i64 %indvars.iv24
%tmp34 = load double, ptr %arrayidx70, align 8, !tbaa !1
%arrayidx74 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
%tmp35 = load double, ptr %arrayidx74, align 8, !tbaa !1
%mul75 = fmul double %tmp34, %tmp35
%sub = fsub double %tmp33, %mul75
%arrayidx79 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21
store double %sub, ptr %arrayidx79, align 8, !tbaa !1
%indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
%exitcond18 = icmp ne i64 %indvars.iv.next17, 512
br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit
for.cond33.for.inc86_crit_edge: ; preds = %for.cond33.loopexit
br label %for.inc86
; Outer loop latch - advances the k + 1 counter and tests k + 1 against 512.
for.inc86: ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader
%indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
%exitcond26 = icmp ne i64 %indvars.iv.next25, 512
br i1 %exitcond26, label %for.cond1.preheader, label %for.end88
for.end88: ; preds = %for.inc86
ret void
}
; Function Attrs: argmemonly nounwind
declare void @llvm.lifetime.end(i64, ptr nocapture) #0
; Function Attrs: nounwind
declare double @sqrt(double) #2
attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.ident = !{!0}
!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
!1 = !{!2, !2, i64 0}
!2 = !{!"double", !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
!4 = !{!"Simple C/C++ TBAA"}

View File

@ -1,41 +0,0 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; REQUIRES: pollyacc
; CODE: Code
; CODE: ====
; CODE: No code generated
source_filename = "bugpoint-output-83bcdeb.bc"
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
@__data_radiation_MOD_cobi = external global [168 x double], align 32
; Function Attrs: nounwind uwtable
; Reduced input (the source_filename of this module names a bugpoint output).
; The function loads an i32 from an undef pointer, branches on an
; `or i1 undef, undef` condition and, on the taken path, loads one double from
; @__data_radiation_MOD_cobi at element index 7 * kspec + 48. The merged value
; is never used and the function returns void. The check lines at the top of
; this test expect the code dump to report "No code generated".
define void @__radiation_rg_MOD_coe_so() #0 {
entry:
%polly.access.kspec.load = load i32, ptr undef, align 4
%0 = or i1 undef, undef
br label %polly.preload.cond29
polly.preload.cond29: ; preds = %entry
br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30
; Joins both paths; the value is 0.0 when the load block was skipped.
polly.preload.merge30: ; preds = %polly.preload.exec31, %polly.preload.cond29
%polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ]
ret void
; Computes the element index 7 * sext(kspec) + 0 + 48 and performs the load.
polly.preload.exec31: ; preds = %polly.preload.cond29
%1 = sext i32 %polly.access.kspec.load to i64
%2 = mul nsw i64 7, %1
%3 = add nsw i64 0, %2
%4 = add nsw i64 %3, 48
%polly.access.__data_radiation_MOD_cobi = getelementptr double, ptr @__data_radiation_MOD_cobi, i64 %4
%polly.access.__data_radiation_MOD_cobi.load = load double, ptr %polly.access.__data_radiation_MOD_cobi, align 8
br label %polly.preload.merge30
}
attributes #0 = { nounwind uwtable }

View File

@ -1,76 +0,0 @@
; RUN: opt -opaque-pointers=0 %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR
; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR
; Test that we do recognise and codegen a kernel that has intrinsics.
; REQUIRES: pollyacc
; Check that we model the kernel as a scop.
; SCOP: Function: f
; SCOP-NEXT: Region: %entry.split---%for.end
; Check that the intrinsic call is present in the kernel IR.
; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
; KERNEL-IR: declare float @llvm.sqrt.f32(float)
; KERNEL-IR: declare float @llvm.fabs.f32(float)
; Check that a kernel launch is generated in the host IR.
; The declaration would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
; void f(float *A, float *B, int N) {
; for(int i = 0; i < N; i++) {
; float tmp0 = A[i];
; float tmp1 = sqrt(tmp0);
; float tmp2 = fabs(tmp1);
; float tmp3 = copysignf(tmp1, tmp2);
; B[i] = tmp3;
; }
; }
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; For i in 0 .. N-1 (the loop is skipped entirely when N <= 0) -
;   s    = llvm.sqrt.f32(A[i])
;   a    = llvm.fabs.f32(s)
;   B[i] = llvm.copysign.f32(s, a)
; This module uses typed pointers (float*), matching the -opaque-pointers=0
; flag on the run lines of this test.
define void @f(float* %A, float* %B, i32 %N) {
entry:
br label %entry.split
; Guard - only enter the loop when %N is strictly positive.
entry.split: ; preds = %entry
%cmp1 = icmp sgt i32 %N, 0
br i1 %cmp1, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry.split
br label %for.body
for.body: ; preds = %for.body.lr.ph, %for.body
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
%A.arr.i.val = load float, float* %A.arr.i, align 4
; Call to intrinsics that should be part of the kernel.
%sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
%fabs = tail call float @llvm.fabs.f32(float %sqrt);
%copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
%B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %copysign, float* %B.arr.i, align 4
; Loop exit test compares the 64-bit counter with %N zero-extended to i64.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%wide.trip.count = zext i32 %N to i64
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
for.cond.for.end_crit_edge: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split
ret void
}
; Function Attrs: nounwind readnone
declare float @llvm.sqrt.f32(float) #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.copysign.f32(float, float) #0
attributes #0 = { nounwind readnone }

View File

@ -1,47 +0,0 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \
; RUN: -disable-output < %s
; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually
; fail on an illegal module.
; REQUIRES: pollyacc, asserts
; XFAIL: *
;
; void foo(long A[1024], long B[1024]) {
; for (long i = 0; i < 1024; i++)
; A[i] += (B[i] + (long)&B[i]);
; }
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; IR form of the C loop quoted in the comment above -
;   for i in 0 .. 1023:  A[i] += B[i] + (i64)&B[i]
; The address of B[i] is turned into an integer with ptrtoint and takes part
; in the stored value.
define void @foo(ptr %A, ptr %B) {
bb:
br label %bb1
; Loop header - %i.0 starts at 0 and the loop runs while %i.0 != 1024.
bb1: ; preds = %bb10, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
%exitcond = icmp ne i64 %i.0, 1024
br i1 %exitcond, label %bb2, label %bb12
bb2: ; preds = %bb1
%tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
%tmp3 = load i64, ptr %tmp, align 8
; Second GEP to the same element; its address (not its contents) is used.
%tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
%tmp5 = ptrtoint ptr %tmp4 to i64
%tmp6 = add nsw i64 %tmp3, %tmp5
%tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
%tmp8 = load i64, ptr %tmp7, align 8
%tmp9 = add nsw i64 %tmp8, %tmp6
store i64 %tmp9, ptr %tmp7, align 8
br label %bb10
; Loop latch.
bb10: ; preds = %bb2
%tmp11 = add nuw nsw i64 %i.0, 1
br label %bb1
bb12: ; preds = %bb1
ret void
}

View File

@ -1,73 +0,0 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
; RUN: -disable-output < %s | \
; RUN: FileCheck -check-prefix=CODE %s
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -disable-output < %s | \
; RUN: not FileCheck %s -check-prefix=KERNEL-IR
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
; RUN: FileCheck %s -check-prefix=IR
; REQUIRES: pollyacc
;
; void foo(long A[1024], long B[1024]) {
; for (long i = 0; i < 1024; i++)
; A[i] += (B[i] + (long)&B[i]);
; }
; This kernel loads/stores a pointer address we model. This is a rare case,
; where we still lack proper code-generation support. We check here that we
; detect the invalid IR and bail out gracefully.
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
; CODE-NEXT: dim3 k0_dimGrid(32);
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B, dev_MemRef_A);
; CODE-NEXT: cudaCheckKernel();
; CODE-NEXT: }
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
; RUN: FileCheck %s -check-prefix=IR
; KERNEL-IR: kernel
; IR: br i1 false, label %polly.start, label %bb1
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; IR form of the C loop quoted in the comment near the top of this test -
;   for i in 0 .. 1023:  A[i] += B[i] + (i64)&B[i]
; The address of B[i] is turned into an integer with ptrtoint and takes part
; in the stored value.
define void @foo(ptr %A, ptr %B) {
bb:
br label %bb1
; Loop header - %i.0 starts at 0 and the loop runs while %i.0 != 1024.
bb1: ; preds = %bb10, %bb
%i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
%exitcond = icmp ne i64 %i.0, 1024
br i1 %exitcond, label %bb2, label %bb12
bb2: ; preds = %bb1
%tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
%tmp3 = load i64, ptr %tmp, align 8
; Second GEP to the same element; its address (not its contents) is used.
%tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
%tmp5 = ptrtoint ptr %tmp4 to i64
%tmp6 = add nsw i64 %tmp3, %tmp5
%tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
%tmp8 = load i64, ptr %tmp7, align 8
%tmp9 = add nsw i64 %tmp8, %tmp6
store i64 %tmp9, ptr %tmp7, align 8
br label %bb10
; Loop latch.
bb10: ; preds = %bb2
%tmp11 = add nuw nsw i64 %i.0, 1
br label %bb1
bb12: ; preds = %bb1
ret void
}

View File

@ -1,70 +0,0 @@
; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
; REQUIRES: pollyacc
; Check that we detect a scop.
; SCOP: Function: f
; SCOP-NEXT: Region: %for.body---%for.end
; SCOP-NEXT: Max Loop Depth: 1
; SCOP-NEXT: Invariant Accesses: {
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOP-NEXT: [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] };
; SCOP-NEXT: Execution Context: [tmp] -> { : }
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOP-NEXT: [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] };
; SCOP-NEXT: Execution Context: [tmp] -> { : tmp >= 4 }
; SCOP-NEXT: }
; Check that a kernel launch is generated in the host IR.
; The declaration would not be generated unless a call to a kernel exists.
; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
; This test makes sure that such an access pattern is handled correctly
; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads`
; was the main reason that caused this test case to crash.
;
; void f(int *arr, const int *control, const int *readarr) {
; for(int i = 0; i < 1000; i++) {
; int t = 0;
; if (*control > 3) {
; t += *readarr;
; }
; arr[i] = t;
; }
; }
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.12.0"
; IR form of the C loop quoted in the comment above -
;   for i in 0 .. 999:  arr[i] = (*control > 3) ? *readarr : 0
; %control is loaded on every iteration; %readarr is loaded only on the path
; where the loaded control value is greater than 3.
define void @f(ptr %arr, ptr %control, ptr %readarr) {
entry:
br label %entry.split
entry.split: ; preds = %entry
br label %for.body
; Loop header and condition on *control.
for.body: ; preds = %entry.split, %if.end
%i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ]
%tmp = load i32, ptr %control, align 4
%cmp1 = icmp sgt i32 %tmp, 3
br i1 %cmp1, label %if.then, label %if.end
; Conditional load of *readarr.
if.then: ; preds = %for.body
%tmp1 = load i32, ptr %readarr, align 4
br label %if.end
; %t.0 is the loaded value, or 0 when the condition was false; it is stored
; to arr[i]. The loop exits once the incremented counter equals 1000.
if.end: ; preds = %if.then, %for.body
%t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ]
%arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.01
store i32 %t.0, ptr %arrayidx, align 4
%inc = add nuw nsw i32 %i.01, 1
%exitcond = icmp eq i32 %inc, 1000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %if.end
ret void
}

Some files were not shown because too many files have changed in this diff Show More