[Polly] Remove Polly-ACC.
Polly-ACC is unmaintained and has never been ported to the NPM pipeline; since D136621 it is not even accessible anymore without manually specifying the passes on the `opt` command line. Since there is no plan to bring it into a maintainable state, remove it from Polly. Reviewed By: grosser Differential Revision: https://reviews.llvm.org/D142580
This commit is contained in:
parent
115c7beda7
commit
19afbfe331
@ -85,31 +85,6 @@ endif ()
|
||||
|
||||
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
|
||||
|
||||
option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
|
||||
set(GPU_CODEGEN FALSE)
|
||||
if (POLLY_ENABLE_GPGPU_CODEGEN)
|
||||
# Do not require CUDA/OpenCL, as GPU code generation test cases can be run
|
||||
# without a CUDA/OpenCL library.
|
||||
if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
|
||||
FIND_PACKAGE(CUDA)
|
||||
FIND_PACKAGE(OpenCL)
|
||||
set(GPU_CODEGEN TRUE)
|
||||
else()
|
||||
message(WARNING "The LLVM NVPTX target is required for GPU code generation")
|
||||
endif()
|
||||
endif(POLLY_ENABLE_GPGPU_CODEGEN)
|
||||
|
||||
|
||||
# Support GPGPU code generation if the library is available.
|
||||
if (CUDA_FOUND)
|
||||
add_definitions(-DHAS_LIBCUDART)
|
||||
INCLUDE_DIRECTORIES( ${CUDA_INCLUDE_DIRS} )
|
||||
endif(CUDA_FOUND)
|
||||
if (OpenCL_FOUND)
|
||||
add_definitions(-DHAS_LIBOPENCL)
|
||||
INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
|
||||
endif(OpenCL_FOUND)
|
||||
|
||||
option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
|
||||
if (NOT POLLY_BUNDLED_ISL)
|
||||
find_package(ISL MODULE REQUIRED)
|
||||
@ -155,7 +130,6 @@ add_subdirectory(test)
|
||||
if (POLLY_GTEST_AVAIL)
|
||||
add_subdirectory(unittests)
|
||||
endif ()
|
||||
add_subdirectory(tools)
|
||||
add_subdirectory(cmake)
|
||||
# TODO: docs.
|
||||
|
||||
|
||||
@ -27,9 +27,6 @@ if (NOT WIN32 AND LLVM_ENABLE_PIC)
|
||||
# LLVMPolly is a dummy target on Win or if PIC code is disabled.
|
||||
list(APPEND POLLY_CONFIG_EXPORTED_TARGETS LLVMPolly)
|
||||
endif()
|
||||
if (POLLY_ENABLE_GPGPU_CODEGEN)
|
||||
list(APPEND POLLY_CONFIG_EXPORTED_TARGETS PollyPPCG)
|
||||
endif()
|
||||
|
||||
# Get the target type for every exported target
|
||||
foreach(tgt IN LISTS POLLY_CONFIG_EXPORTED_TARGETS)
|
||||
|
||||
@ -8,7 +8,6 @@ find_package(LLVM ${LLVM_VERSION} EXACT REQUIRED CONFIG
|
||||
|
||||
set(Polly_CMAKE_DIR ${CMAKE_CURRENT_LIST_DIR})
|
||||
set(Polly_BUNDLED_ISL @POLLY_BUNDLED_ISL@)
|
||||
set(Polly_ENABLE_GPGPU_CODEGEN @POLLY_ENABLE_GPGPU_CODEGEN@)
|
||||
|
||||
set(Polly_DEFINITIONS ${LLVM_DEFINITIONS})
|
||||
set(Polly_INCLUDE_DIRS @POLLY_CONFIG_INCLUDE_DIRS@ ${LLVM_INCLUDE_DIRS})
|
||||
@ -19,17 +18,9 @@ set(Polly_LIBRARIES ${LLVM_LIBRARIES} ${Polly_EXPORTED_TARGETS})
|
||||
# Imported Targets:
|
||||
@ISL_CONFIG_CODE@
|
||||
|
||||
if (Polly_ENABLE_GPGPU_CODEGEN AND NOT TARGET PollyPPCG)
|
||||
add_library(PollyPPCG @POLLY_CONFIG_TARGET_PollyPPCG_TYPE@ IMPORTED)
|
||||
set_property(TARGET PollyPPCG PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@)
|
||||
endif()
|
||||
|
||||
if (NOT TARGET Polly)
|
||||
add_library(Polly @POLLY_CONFIG_TARGET_Polly_TYPE@ IMPORTED)
|
||||
set_property(TARGET Polly PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@)
|
||||
if (Polly_ENABLE_GPGPU_CODEGEN)
|
||||
set_property(TARGET Polly APPEND PROPERTY INTERFACE_LINK_LIBRARIES PollyPPCG)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT TARGET LLVMPolly)
|
||||
|
||||
@ -21,3 +21,5 @@ In Polly |version| the following important changes have been incorporated.
|
||||
In the future we hope that Polly can collaborate better with LoopVectorize,
|
||||
like Polly marking a loop as safe to vectorize with a specific simd width,
|
||||
instead of replicating its functionality.
|
||||
|
||||
- Polly-ACC has been removed.
|
||||
|
||||
@ -1,33 +0,0 @@
|
||||
//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
|
||||
// GPU mapping strategy.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef POLLY_PPCGCODEGENERATION_H
|
||||
#define POLLY_PPCGCODEGENERATION_H
|
||||
|
||||
/// The GPU Architecture to target.
|
||||
enum GPUArch { NVPTX64, SPIR32, SPIR64 };
|
||||
|
||||
/// The GPU Runtime implementation to use.
|
||||
enum GPURuntime { CUDA, OpenCL };
|
||||
|
||||
namespace polly {
|
||||
extern bool PollyManagedMemory;
|
||||
|
||||
/// Use for pass instantiation defaults.
|
||||
/// @{
|
||||
extern GPURuntime GPURuntimeChoice;
|
||||
extern GPUArch GPUArchChoice;
|
||||
/// @}
|
||||
} // namespace polly
|
||||
|
||||
#endif // POLLY_PPCGCODEGENERATION_H
|
||||
@ -30,24 +30,20 @@ namespace polly {
|
||||
struct RuntimeDebugBuilder {
|
||||
|
||||
/// Generate a constant string into the builder's llvm::Module which can be
|
||||
/// passed to createGPUPrinter() or createGPUPrinter().
|
||||
/// passed to createCPUPrinter().
|
||||
///
|
||||
/// @param Builder The builder used to emit the printer calls.
|
||||
/// @param Str The string to be printed.
|
||||
|
||||
/// @return A global containing @p Str.
|
||||
static llvm::Value *getPrintableString(PollyIRBuilder &Builder,
|
||||
llvm::StringRef Str) {
|
||||
// TODO: Get rid of magic number 4. It is NVPTX's constant address space and
|
||||
// works on X86 (CPU) only because its backend ignores the address space.
|
||||
return Builder.CreateGlobalStringPtr(Str, "", 4);
|
||||
}
|
||||
llvm::StringRef Str);
|
||||
|
||||
/// Return whether an llvm::Value of the type @p Ty is printable for
|
||||
/// debugging.
|
||||
///
|
||||
/// That is, whether such a value can be passed to createGPUPrinter() or
|
||||
/// createGPUPrinter() to be dumped as runtime. If false is returned, those
|
||||
/// That is, whether such a value can be passed to createCPUPrinter()
|
||||
/// to be dumped at runtime. If false is returned, those
|
||||
/// functions will fail.
|
||||
static bool isPrintable(llvm::Type *Ty);
|
||||
|
||||
@ -64,62 +60,41 @@ struct RuntimeDebugBuilder {
|
||||
template <typename... Args>
|
||||
static void createCPUPrinter(PollyIRBuilder &Builder, Args... args) {
|
||||
std::vector<llvm::Value *> Vector;
|
||||
createPrinter(Builder, /* CPU */ false, Vector, args...);
|
||||
}
|
||||
|
||||
/// Print a set of LLVM-IR Values or StringRefs on an NVIDIA GPU.
|
||||
///
|
||||
/// This function emits a call to vprintf that will print the given
|
||||
/// arguments from within a kernel thread. It is useful for debugging
|
||||
/// CUDA program kernels. All arguments given in this list will be
|
||||
/// automatically concatenated and the resulting string will be printed
|
||||
/// atomically. We also support ArrayRef arguments, which can be used to
|
||||
/// provide for example a list of thread-id values.
|
||||
///
|
||||
/// @param Builder The builder used to emit the printer calls.
|
||||
/// @param Args The list of values to print.
|
||||
template <typename... Args>
|
||||
static void createGPUPrinter(PollyIRBuilder &Builder, Args... args) {
|
||||
std::vector<llvm::Value *> Vector;
|
||||
createPrinter(Builder, /* GPU */ true, Vector, args...);
|
||||
createPrinter(Builder, Vector, args...);
|
||||
}
|
||||
|
||||
private:
|
||||
/// Handle Values.
|
||||
template <typename... Args>
|
||||
static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
|
||||
static void createPrinter(PollyIRBuilder &Builder,
|
||||
std::vector<llvm::Value *> &Values,
|
||||
llvm::Value *Value, Args... args) {
|
||||
Values.push_back(Value);
|
||||
createPrinter(Builder, UseGPU, Values, args...);
|
||||
createPrinter(Builder, Values, args...);
|
||||
}
|
||||
|
||||
/// Handle StringRefs.
|
||||
template <typename... Args>
|
||||
static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
|
||||
static void createPrinter(PollyIRBuilder &Builder,
|
||||
std::vector<llvm::Value *> &Values,
|
||||
llvm::StringRef String, Args... args) {
|
||||
Values.push_back(getPrintableString(Builder, String));
|
||||
createPrinter(Builder, UseGPU, Values, args...);
|
||||
createPrinter(Builder, Values, args...);
|
||||
}
|
||||
|
||||
/// Handle ArrayRefs.
|
||||
template <typename... Args>
|
||||
static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
|
||||
static void createPrinter(PollyIRBuilder &Builder,
|
||||
std::vector<llvm::Value *> &Values,
|
||||
llvm::ArrayRef<llvm::Value *> Array, Args... args) {
|
||||
Values.insert(Values.end(), Array.begin(), Array.end());
|
||||
createPrinter(Builder, UseGPU, Values, args...);
|
||||
createPrinter(Builder, Values, args...);
|
||||
}
|
||||
|
||||
/// Print a list of Values.
|
||||
static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
|
||||
static void createPrinter(PollyIRBuilder &Builder,
|
||||
llvm::ArrayRef<llvm::Value *> Values);
|
||||
|
||||
/// Print a list of Values on a GPU.
|
||||
static void createGPUPrinterT(PollyIRBuilder &Builder,
|
||||
llvm::ArrayRef<llvm::Value *> Values);
|
||||
|
||||
/// Print a list of Values on a CPU.
|
||||
static void createCPUPrinterT(PollyIRBuilder &Builder,
|
||||
llvm::ArrayRef<llvm::Value *> Values);
|
||||
@ -145,22 +120,6 @@ private:
|
||||
///
|
||||
/// @param Builder The builder used to insert the code.
|
||||
static void createFlush(PollyIRBuilder &Builder);
|
||||
|
||||
/// Get (and possibly insert) a NVIDIA address space cast call.
|
||||
static llvm::Function *getAddressSpaceCast(PollyIRBuilder &Builder,
|
||||
unsigned Src, unsigned Dst,
|
||||
unsigned SrcBits = 8,
|
||||
unsigned DstBits = 8);
|
||||
|
||||
/// Get identifiers that describe the currently executed GPU thread.
|
||||
///
|
||||
/// The result will be a vector that if passed to the GPU printer will result
|
||||
/// into a string (initialized to values corresponding to the printing
|
||||
/// thread):
|
||||
///
|
||||
/// "> block-id: bidx bid1y bidz | thread-id: tidx tidy tidz "
|
||||
static std::vector<llvm::Value *>
|
||||
getGPUThreadIdentifiers(PollyIRBuilder &Builder);
|
||||
};
|
||||
} // namespace polly
|
||||
|
||||
|
||||
@ -12,7 +12,4 @@
|
||||
#ifndef POLLY_CONFIG_H
|
||||
#define POLLY_CONFIG_H
|
||||
|
||||
#cmakedefine CUDA_FOUND
|
||||
#cmakedefine GPU_CODEGEN
|
||||
|
||||
#endif
|
||||
|
||||
@ -14,7 +14,6 @@
|
||||
#ifndef POLLY_LINKALLPASSES_H
|
||||
#define POLLY_LINKALLPASSES_H
|
||||
|
||||
#include "polly/CodeGen/PPCGCodeGeneration.h"
|
||||
#include "polly/Config/config.h"
|
||||
#include "polly/Support/DumpFunctionPass.h"
|
||||
#include "polly/Support/DumpModulePass.h"
|
||||
@ -54,14 +53,6 @@ llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS);
|
||||
llvm::Pass *createIslAstInfoWrapperPassPass();
|
||||
llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS);
|
||||
llvm::Pass *createCodeGenerationPass();
|
||||
#ifdef GPU_CODEGEN
|
||||
llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
|
||||
GPURuntime Runtime = GPURuntime::CUDA);
|
||||
|
||||
llvm::Pass *
|
||||
createManagedMemoryRewritePassPass(GPUArch Arch = GPUArch::NVPTX64,
|
||||
GPURuntime Runtime = GPURuntime::CUDA);
|
||||
#endif
|
||||
llvm::Pass *createIslScheduleOptimizerWrapperPass();
|
||||
llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS);
|
||||
llvm::Pass *createFlattenSchedulePass();
|
||||
@ -113,10 +104,6 @@ struct PollyForcePassLinking {
|
||||
polly::createIslAstInfoWrapperPassPass();
|
||||
polly::createIslAstInfoPrinterLegacyPass(llvm::outs());
|
||||
polly::createCodeGenerationPass();
|
||||
#ifdef GPU_CODEGEN
|
||||
polly::createPPCGCodeGenerationPass();
|
||||
polly::createManagedMemoryRewritePassPass();
|
||||
#endif
|
||||
polly::createIslScheduleOptimizerWrapperPass();
|
||||
polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs());
|
||||
polly::createMaximalStaticExpansionPass();
|
||||
@ -156,10 +143,6 @@ void initializeDependenceInfoPrinterLegacyFunctionPassPass(
|
||||
void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &);
|
||||
void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &);
|
||||
void initializeCodeGenerationPass(llvm::PassRegistry &);
|
||||
#ifdef GPU_CODEGEN
|
||||
void initializePPCGCodeGenerationPass(llvm::PassRegistry &);
|
||||
void initializeManagedMemoryRewritePassPass(llvm::PassRegistry &);
|
||||
#endif
|
||||
void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &);
|
||||
void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &);
|
||||
void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &);
|
||||
|
||||
@ -1684,9 +1684,6 @@ private:
|
||||
/// Number of copy statements.
|
||||
unsigned CopyStmtsNum = 0;
|
||||
|
||||
/// Flag to indicate if the Scop is to be skipped.
|
||||
bool SkipScop = false;
|
||||
|
||||
using StmtSet = std::list<ScopStmt>;
|
||||
|
||||
/// The statements in this Scop.
|
||||
@ -2144,12 +2141,6 @@ public:
|
||||
/// Check if the SCoP has been optimized by the scheduler.
|
||||
bool isOptimized() const { return IsOptimized; }
|
||||
|
||||
/// Mark the SCoP to be skipped by ScopPass passes.
|
||||
void markAsToBeSkipped() { SkipScop = true; }
|
||||
|
||||
/// Check if the SCoP is to be skipped by ScopPass passes.
|
||||
bool isToBeSkipped() const { return SkipScop; }
|
||||
|
||||
/// Return the ID of the Scop
|
||||
int getID() const { return ID; }
|
||||
|
||||
|
||||
@ -1,42 +0,0 @@
|
||||
//===- Support/LinkGPURuntime.h -- Headerfile to help force-link GPURuntime =//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This header helps pull in libGPURuntime.so
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#ifndef POLLY_LINK_GPURUNTIME
|
||||
#define POLLY_LINK_GPURUNTIME
|
||||
|
||||
extern "C" {
|
||||
#include "GPURuntime/GPUJIT.h"
|
||||
}
|
||||
|
||||
namespace polly {
|
||||
struct ForceGPURuntimeLinking {
|
||||
ForceGPURuntimeLinking() {
|
||||
if (std::getenv("bar") != (char *)-1)
|
||||
return;
|
||||
// We must reference GPURuntime in such a way that compilers will not
|
||||
// delete it all as dead code, even with whole program optimization,
|
||||
// yet is effectively a NO-OP. As the compiler isn't smart enough
|
||||
// to know that getenv() never returns -1, this will do the job.
|
||||
polly_initContextCL();
|
||||
polly_initContextCUDA();
|
||||
polly_getKernel(nullptr, nullptr);
|
||||
polly_freeKernel(nullptr);
|
||||
polly_copyFromHostToDevice(nullptr, nullptr, 0);
|
||||
polly_copyFromDeviceToHost(nullptr, nullptr, 0);
|
||||
polly_synchronizeDevice();
|
||||
polly_launchKernel(nullptr, 0, 0, 0, 0, 0, nullptr);
|
||||
polly_freeDeviceMemory(nullptr);
|
||||
polly_freeContext(nullptr);
|
||||
polly_synchronizeDevice();
|
||||
}
|
||||
} structure;
|
||||
} // namespace polly
|
||||
#endif
|
||||
@ -6,13 +6,6 @@ set(ISL_CODEGEN_FILES
|
||||
CodeGen/IslNodeBuilder.cpp
|
||||
CodeGen/CodeGeneration.cpp)
|
||||
|
||||
if (GPU_CODEGEN)
|
||||
set (GPGPU_CODEGEN_FILES
|
||||
CodeGen/PPCGCodeGeneration.cpp
|
||||
CodeGen/ManagedMemoryRewrite.cpp
|
||||
)
|
||||
endif (GPU_CODEGEN)
|
||||
|
||||
# Compile ISL into a separate library.
|
||||
add_subdirectory(External)
|
||||
|
||||
@ -44,12 +37,6 @@ set(POLLY_COMPONENTS
|
||||
Vectorize
|
||||
)
|
||||
|
||||
# Polly-ACC requires the NVPTX backend to work. Ask LLVM about its libraries.
|
||||
if (GPU_CODEGEN)
|
||||
# This call emits an error if the NVPTX backend is not enabled.
|
||||
list(APPEND POLLY_COMPONENTS NVPTX)
|
||||
endif ()
|
||||
|
||||
# Use an object-library to add the same files to multiple libs without requiring
|
||||
# the sources to be recompiled for each of them.
|
||||
add_llvm_pass_plugin(Polly
|
||||
@ -73,7 +60,6 @@ add_llvm_pass_plugin(Polly
|
||||
CodeGen/Utils.cpp
|
||||
CodeGen/RuntimeDebugBuilder.cpp
|
||||
CodeGen/PerfMonitor.cpp
|
||||
${GPGPU_CODEGEN_FILES}
|
||||
Exchange/JSONExporter.cpp
|
||||
Support/GICHelper.cpp
|
||||
Support/SCEVAffinator.cpp
|
||||
@ -127,16 +113,6 @@ target_link_libraries(Polly PUBLIC
|
||||
${ISL_TARGET}
|
||||
)
|
||||
|
||||
# Additional dependencies for Polly-ACC.
|
||||
if (GPU_CODEGEN)
|
||||
target_link_libraries(Polly PUBLIC PollyPPCG)
|
||||
endif ()
|
||||
|
||||
if (NOT LLVM_LINK_LLVM_DYLIB AND NOT LLVM_POLLY_LINK_INTO_TOOLS)
|
||||
# Polly-ACC requires the NVPTX target to be present in the executable it is linked to
|
||||
set_property(TARGET bugpoint APPEND PROPERTY LINK_LIBRARIES LLVMTarget)
|
||||
endif ()
|
||||
|
||||
# Create a loadable module Polly.so that can be loaded using
|
||||
# LLVM's/clang's "-load" option.
|
||||
if (WIN32 OR NOT LLVM_ENABLE_PIC)
|
||||
@ -150,19 +126,6 @@ else ()
|
||||
$<TARGET_OBJECTS:obj.Polly>
|
||||
)
|
||||
|
||||
# Only add the dependencies that are not part of LLVM. The latter are assumed
|
||||
# to be already available in the address space the module is loaded into.
|
||||
# Adding them once more would have the effect that both copies try to register
|
||||
# the same command line options, to which LLVM reacts with an error.
|
||||
# If Polly-ACC is enabled, the NVPTX target is also expected to reside in the
|
||||
# hosts. This is not the case for bugpoint. Use LLVM_POLLY_LINK_INTO_TOOLS=ON
|
||||
# instead which will automatically resolve the additional dependencies by
|
||||
# Polly.
|
||||
target_link_libraries(LLVMPolly PUBLIC ${ISL_TARGET})
|
||||
if (GPU_CODEGEN)
|
||||
target_link_libraries(LLVMPolly PUBLIC PollyPPCG)
|
||||
endif ()
|
||||
|
||||
set_target_properties(LLVMPolly
|
||||
PROPERTIES
|
||||
LINKER_LANGUAGE CXX
|
||||
|
||||
@ -238,14 +238,8 @@ void BlockGenerator::copyInstScalar(ScopStmt &Stmt, Instruction *Inst,
|
||||
Builder.Insert(NewInst);
|
||||
BBMap[Inst] = NewInst;
|
||||
|
||||
// When copying the instruction onto the Module meant for the GPU,
|
||||
// debug metadata attached to an instruction causes all related
|
||||
// metadata to be pulled into the Module. This includes the DICompileUnit,
|
||||
// which will not be listed in llvm.dbg.cu of the Module since the Module
|
||||
// doesn't contain one. This fails the verification of the Module and the
|
||||
// subsequent generation of the ASM string.
|
||||
if (NewInst->getModule() != Inst->getModule())
|
||||
NewInst->setDebugLoc(llvm::DebugLoc());
|
||||
assert(NewInst->getModule() == Inst->getModule() &&
|
||||
"Expecting instructions to be in the same module");
|
||||
|
||||
if (!NewInst->getType()->isVoidTy())
|
||||
NewInst->setName("p_" + Inst->getName());
|
||||
|
||||
@ -323,10 +323,6 @@ public:
|
||||
|
||||
/// Generate LLVM-IR for the SCoP @p S.
|
||||
bool runOnScop(Scop &S) override {
|
||||
// Skip SCoPs in case they're already code-generated by PPCGCodeGeneration.
|
||||
if (S.isToBeSkipped())
|
||||
return false;
|
||||
|
||||
AI = &getAnalysis<IslAstInfoWrapperPass>().getAI();
|
||||
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
||||
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
|
||||
@ -638,10 +638,6 @@ isl::ast_build IslAstInfo::getBuild(const isl::ast_node &Node) {
|
||||
static std::unique_ptr<IslAstInfo> runIslAst(
|
||||
Scop &Scop,
|
||||
function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps) {
|
||||
// Skip SCoPs in case they're already handled by PPCGCodeGeneration.
|
||||
if (Scop.isToBeSkipped())
|
||||
return {};
|
||||
|
||||
ScopsProcessed++;
|
||||
|
||||
const Dependences &D = GetDeps(Dependences::AL_Statement);
|
||||
|
||||
@ -1,427 +0,0 @@
|
||||
//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Take a module and rewrite:
|
||||
// 1. `malloc` -> `polly_mallocManaged`
|
||||
// 2. `free` -> `polly_freeManaged`
|
||||
// 3. global arrays with initializers -> global arrays that are initialized
|
||||
// with a constructor call to
|
||||
// `polly_mallocManaged`.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "polly/CodeGen/IRBuilder.h"
|
||||
#include "polly/CodeGen/PPCGCodeGeneration.h"
|
||||
#include "polly/DependenceInfo.h"
|
||||
#include "polly/LinkAllPasses.h"
|
||||
#include "polly/Options.h"
|
||||
#include "polly/ScopDetection.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/Analysis/CaptureTracking.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/Transforms/Utils/ModuleUtils.h"
|
||||
|
||||
using namespace llvm;
|
||||
using namespace polly;
|
||||
|
||||
static cl::opt<bool> RewriteAllocas(
|
||||
"polly-acc-rewrite-allocas",
|
||||
cl::desc(
|
||||
"Ask the managed memory rewriter to also rewrite alloca instructions"),
|
||||
cl::Hidden, cl::cat(PollyCategory));
|
||||
|
||||
static cl::opt<bool> IgnoreLinkageForGlobals(
|
||||
"polly-acc-rewrite-ignore-linkage-for-globals",
|
||||
cl::desc(
|
||||
"By default, we only rewrite globals with internal linkage. This flag "
|
||||
"enables rewriting of globals regardless of linkage"),
|
||||
cl::Hidden, cl::cat(PollyCategory));
|
||||
|
||||
#define DEBUG_TYPE "polly-acc-rewrite-managed-memory"
|
||||
namespace {
|
||||
|
||||
static llvm::Function *getOrCreatePollyMallocManaged(Module &M) {
|
||||
const char *Name = "polly_mallocManaged";
|
||||
Function *F = M.getFunction(Name);
|
||||
|
||||
// If F is not available, declare it.
|
||||
if (!F) {
|
||||
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
||||
PollyIRBuilder Builder(M.getContext());
|
||||
// TODO: How do I get `size_t`? I assume from DataLayout?
|
||||
FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(),
|
||||
{Builder.getInt64Ty()}, false);
|
||||
F = Function::Create(Ty, Linkage, Name, &M);
|
||||
}
|
||||
|
||||
return F;
|
||||
}
|
||||
|
||||
static llvm::Function *getOrCreatePollyFreeManaged(Module &M) {
|
||||
const char *Name = "polly_freeManaged";
|
||||
Function *F = M.getFunction(Name);
|
||||
|
||||
// If F is not available, declare it.
|
||||
if (!F) {
|
||||
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
||||
PollyIRBuilder Builder(M.getContext());
|
||||
// TODO: How do I get `size_t`? I assume from DataLayout?
|
||||
FunctionType *Ty =
|
||||
FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false);
|
||||
F = Function::Create(Ty, Linkage, Name, &M);
|
||||
}
|
||||
|
||||
return F;
|
||||
}
|
||||
|
||||
// Expand a constant expression `Cur`, which is used at instruction `Parent`
|
||||
// at index `index`.
|
||||
// Since a constant expression can expand to multiple instructions, store all
|
||||
// the expands into a set called `Expands`.
|
||||
// Note that this goes inorder on the constant expression tree.
|
||||
// A * ((B * D) + C)
|
||||
// will be processed with first A, then B * D, then B, then D, and then C.
|
||||
// Though ConstantExprs are not treated as "trees" but as DAGs, since you can
|
||||
// have something like this:
|
||||
// *
|
||||
// / \
|
||||
// \ /
|
||||
// (D)
|
||||
//
|
||||
// For the purposes of this expansion, we expand the two occurrences of D
|
||||
// separately. Therefore, we expand the DAG into the tree:
|
||||
// *
|
||||
// / \
|
||||
// D D
|
||||
// TODO: We don't _have_ to do this, but this is the simplest solution.
|
||||
// We can write a solution that keeps track of which constants have been
|
||||
// already expanded.
|
||||
static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder,
|
||||
Instruction *Parent, int index,
|
||||
SmallPtrSet<Instruction *, 4> &Expands) {
|
||||
assert(Cur && "invalid constant expression passed");
|
||||
Instruction *I = Cur->getAsInstruction();
|
||||
assert(I && "unable to convert ConstantExpr to Instruction");
|
||||
|
||||
LLVM_DEBUG(dbgs() << "Expanding ConstantExpression: (" << *Cur
|
||||
<< ") in Instruction: (" << *I << ")\n";);
|
||||
|
||||
// Invalidate `Cur` so that no one after this point uses `Cur`. Rather,
|
||||
// they should mutate `I`.
|
||||
Cur = nullptr;
|
||||
|
||||
Expands.insert(I);
|
||||
Parent->setOperand(index, I);
|
||||
|
||||
// The things that `Parent` uses (its operands) should be created
|
||||
// before `Parent`.
|
||||
Builder.SetInsertPoint(Parent);
|
||||
Builder.Insert(I);
|
||||
|
||||
for (unsigned i = 0; i < I->getNumOperands(); i++) {
|
||||
Value *Op = I->getOperand(i);
|
||||
assert(isa<Constant>(Op) && "constant must have a constant operand");
|
||||
|
||||
if (ConstantExpr *CExprOp = dyn_cast<ConstantExpr>(Op))
|
||||
expandConstantExpr(CExprOp, Builder, I, i, Expands);
|
||||
}
|
||||
}
|
||||
|
||||
// Edit all uses of `OldVal` to `NewVal` in `Inst`. This will rewrite
|
||||
// `ConstantExpr`s that are used in the `Inst`.
|
||||
// Note that `replaceAllUsesWith` is insufficient for this purpose because it
|
||||
// does not rewrite values in `ConstantExpr`s.
|
||||
static void rewriteOldValToNew(Instruction *Inst, Value *OldVal, Value *NewVal,
|
||||
PollyIRBuilder &Builder) {
|
||||
|
||||
// This contains a set of instructions in which OldVal must be replaced.
|
||||
// We start with `Inst`, and we fill it up with the expanded `ConstantExpr`s
|
||||
// from `Inst`s arguments.
|
||||
// We need to go through this process because `replaceAllUsesWith` does not
|
||||
// actually edit `ConstantExpr`s.
|
||||
SmallPtrSet<Instruction *, 4> InstsToVisit = {Inst};
|
||||
|
||||
// Expand all `ConstantExpr`s and place it in `InstsToVisit`.
|
||||
for (unsigned i = 0; i < Inst->getNumOperands(); i++) {
|
||||
Value *Operand = Inst->getOperand(i);
|
||||
if (ConstantExpr *ValueConstExpr = dyn_cast<ConstantExpr>(Operand))
|
||||
expandConstantExpr(ValueConstExpr, Builder, Inst, i, InstsToVisit);
|
||||
}
|
||||
|
||||
// Now visit each instruction and use `replaceUsesOfWith`. We know that
|
||||
// will work because `I` cannot have any `ConstantExpr` within it.
|
||||
for (Instruction *I : InstsToVisit)
|
||||
I->replaceUsesOfWith(OldVal, NewVal);
|
||||
}
|
||||
|
||||
// Given a value `V`, return all Instructions that may contain `V`
|
||||
// in an expression.
|
||||
// We need this auxiliary function, because if we have a
|
||||
// `Constant` that is a user of `V`, we need to recurse into the
|
||||
// `Constant`s uses to gather the root instruction.
|
||||
static void getInstructionUsersOfValue(Value *V,
|
||||
SmallVector<Instruction *, 4> &Owners) {
|
||||
if (auto *I = dyn_cast<Instruction>(V)) {
|
||||
Owners.push_back(I);
|
||||
} else {
|
||||
// Anything that is a `User` must be a constant or an instruction.
|
||||
auto *C = cast<Constant>(V);
|
||||
for (Use &CUse : C->uses())
|
||||
getInstructionUsersOfValue(CUse.getUser(), Owners);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array,
|
||||
SmallPtrSet<GlobalVariable *, 4> &ReplacedGlobals) {
|
||||
// We only want arrays.
|
||||
ArrayType *ArrayTy = dyn_cast<ArrayType>(Array.getValueType());
|
||||
if (!ArrayTy)
|
||||
return;
|
||||
Type *ElemTy = ArrayTy->getElementType();
|
||||
PointerType *ElemPtrTy = ElemTy->getPointerTo();
|
||||
|
||||
// We only wish to replace arrays that are visible in the module they
|
||||
// inhabit. Otherwise, our type edit from [T] to T* would be illegal across
|
||||
// modules.
|
||||
const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() ||
|
||||
Array.hasInternalLinkage() ||
|
||||
IgnoreLinkageForGlobals;
|
||||
if (!OnlyVisibleInsideModule) {
|
||||
LLVM_DEBUG(
|
||||
dbgs() << "Not rewriting (" << Array
|
||||
<< ") to managed memory "
|
||||
"because it could be visible externally. To force rewrite, "
|
||||
"use -polly-acc-rewrite-ignore-linkage-for-globals.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!Array.hasInitializer() ||
|
||||
!isa<ConstantAggregateZero>(Array.getInitializer())) {
|
||||
LLVM_DEBUG(dbgs() << "Not rewriting (" << Array
|
||||
<< ") to managed memory "
|
||||
"because it has an initializer which is "
|
||||
"not a zeroinitializer.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// At this point, we have committed to replacing this array.
|
||||
ReplacedGlobals.insert(&Array);
|
||||
|
||||
std::string NewName = Array.getName().str();
|
||||
NewName += ".toptr";
|
||||
GlobalVariable *ReplacementToArr =
|
||||
cast<GlobalVariable>(M.getOrInsertGlobal(NewName, ElemPtrTy));
|
||||
ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy));
|
||||
|
||||
Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
|
||||
std::string FnName = Array.getName().str();
|
||||
FnName += ".constructor";
|
||||
PollyIRBuilder Builder(M.getContext());
|
||||
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
|
||||
const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
||||
Function *F = Function::Create(Ty, Linkage, FnName, &M);
|
||||
BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F);
|
||||
Builder.SetInsertPoint(Start);
|
||||
|
||||
const uint64_t ArraySizeInt = DL.getTypeAllocSize(ArrayTy);
|
||||
Value *ArraySize = Builder.getInt64(ArraySizeInt);
|
||||
ArraySize->setName("array.size");
|
||||
|
||||
Value *AllocatedMemRaw =
|
||||
Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw");
|
||||
Value *AllocatedMemTyped =
|
||||
Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed");
|
||||
Builder.CreateStore(AllocatedMemTyped, ReplacementToArr);
|
||||
Builder.CreateRetVoid();
|
||||
|
||||
const int Priority = 0;
|
||||
appendToGlobalCtors(M, F, Priority, ReplacementToArr);
|
||||
|
||||
SmallVector<Instruction *, 4> ArrayUserInstructions;
|
||||
// Get all instructions that use array. We need to do this weird thing
|
||||
// because `Constant`s that contain this array need to be expanded into
|
||||
// instructions so that we can replace their parameters. `Constant`s cannot
|
||||
// be edited easily, so we choose to convert all `Constant`s to
|
||||
// `Instruction`s and handle all of the uses of `Array` uniformly.
|
||||
for (Use &ArrayUse : Array.uses())
|
||||
getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions);
|
||||
|
||||
for (Instruction *UserOfArrayInst : ArrayUserInstructions) {
|
||||
|
||||
Builder.SetInsertPoint(UserOfArrayInst);
|
||||
// <ty>** -> <ty>*
|
||||
Value *ArrPtrLoaded =
|
||||
Builder.CreateLoad(ElemPtrTy, ReplacementToArr, "arrptr.load");
|
||||
// <ty>* -> [ty]*
|
||||
Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast(
|
||||
ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast");
|
||||
rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder);
|
||||
}
|
||||
}
|
||||
|
||||
// We return all `allocas` that may need to be converted to a call to
|
||||
// cudaMallocManaged.
|
||||
static void getAllocasToBeManaged(Function &F,
|
||||
SmallSet<AllocaInst *, 4> &Allocas) {
|
||||
for (BasicBlock &BB : F) {
|
||||
for (Instruction &I : BB) {
|
||||
auto *Alloca = dyn_cast<AllocaInst>(&I);
|
||||
if (!Alloca)
|
||||
continue;
|
||||
LLVM_DEBUG(dbgs() << "Checking if (" << *Alloca << ") may be captured: ");
|
||||
|
||||
if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false,
|
||||
/* StoreCaptures */ true)) {
|
||||
Allocas.insert(Alloca);
|
||||
LLVM_DEBUG(dbgs() << "YES (captured).\n");
|
||||
} else {
|
||||
LLVM_DEBUG(dbgs() << "NO (not captured).\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca,
|
||||
const DataLayout &DL) {
|
||||
LLVM_DEBUG(dbgs() << "rewriting: (" << *Alloca << ") to managed mem.\n");
|
||||
Module *M = Alloca->getModule();
|
||||
assert(M && "Alloca does not have a module");
|
||||
|
||||
PollyIRBuilder Builder(M->getContext());
|
||||
Builder.SetInsertPoint(Alloca);
|
||||
|
||||
Function *MallocManagedFn =
|
||||
getOrCreatePollyMallocManaged(*Alloca->getModule());
|
||||
const uint64_t Size = DL.getTypeAllocSize(Alloca->getAllocatedType());
|
||||
Value *SizeVal = Builder.getInt64(Size);
|
||||
Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal});
|
||||
Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType());
|
||||
|
||||
Function *F = Alloca->getFunction();
|
||||
assert(F && "Alloca has invalid function");
|
||||
|
||||
Bitcasted->takeName(Alloca);
|
||||
Alloca->replaceAllUsesWith(Bitcasted);
|
||||
Alloca->eraseFromParent();
|
||||
|
||||
for (BasicBlock &BB : *F) {
|
||||
ReturnInst *Return = dyn_cast<ReturnInst>(BB.getTerminator());
|
||||
if (!Return)
|
||||
continue;
|
||||
Builder.SetInsertPoint(Return);
|
||||
|
||||
Function *FreeManagedFn = getOrCreatePollyFreeManaged(*M);
|
||||
Builder.CreateCall(FreeManagedFn, {RawManagedMem});
|
||||
}
|
||||
}
|
||||
|
||||
// Replace all uses of `Old` with `New`, even inside `ConstantExpr`.
|
||||
//
|
||||
// `replaceAllUsesWith` does not replace values in `ConstantExpr`. This function
|
||||
// actually does replace it in `ConstantExpr`. The caveat is that if there is
|
||||
// a use that is *outside* a function (say, at global declarations), we fail.
|
||||
// So, this is meant to be used on values which we know will only be used
|
||||
// within functions.
|
||||
//
|
||||
// This process works by looking through the uses of `Old`. If it finds a
|
||||
// `ConstantExpr`, it recursively looks for the owning instruction.
|
||||
// Then, it expands all the `ConstantExpr` to instructions and replaces
|
||||
// `Old` with `New` in the expanded instructions.
|
||||
static void replaceAllUsesAndConstantUses(Value *Old, Value *New,
|
||||
PollyIRBuilder &Builder) {
|
||||
SmallVector<Instruction *, 4> UserInstructions;
|
||||
// Get all instructions that use array. We need to do this weird thing
|
||||
// because `Constant`s that contain this array need to be expanded into
|
||||
// instructions so that we can replace their parameters. `Constant`s cannot
|
||||
// be edited easily, so we choose to convert all `Constant`s to
|
||||
// `Instruction`s and handle all of the uses of `Array` uniformly.
|
||||
for (Use &ArrayUse : Old->uses())
|
||||
getInstructionUsersOfValue(ArrayUse.getUser(), UserInstructions);
|
||||
|
||||
for (Instruction *I : UserInstructions)
|
||||
rewriteOldValToNew(I, Old, New, Builder);
|
||||
}
|
||||
|
||||
class ManagedMemoryRewritePass final : public ModulePass {
|
||||
public:
|
||||
static char ID;
|
||||
GPUArch Architecture;
|
||||
GPURuntime Runtime;
|
||||
|
||||
ManagedMemoryRewritePass() : ModulePass(ID) {}
|
||||
bool runOnModule(Module &M) override {
|
||||
const DataLayout &DL = M.getDataLayout();
|
||||
|
||||
Function *Malloc = M.getFunction("malloc");
|
||||
|
||||
if (Malloc) {
|
||||
PollyIRBuilder Builder(M.getContext());
|
||||
Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
|
||||
assert(PollyMallocManaged && "unable to create polly_mallocManaged");
|
||||
|
||||
replaceAllUsesAndConstantUses(Malloc, PollyMallocManaged, Builder);
|
||||
Malloc->eraseFromParent();
|
||||
}
|
||||
|
||||
Function *Free = M.getFunction("free");
|
||||
|
||||
if (Free) {
|
||||
PollyIRBuilder Builder(M.getContext());
|
||||
Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M);
|
||||
assert(PollyFreeManaged && "unable to create polly_freeManaged");
|
||||
|
||||
replaceAllUsesAndConstantUses(Free, PollyFreeManaged, Builder);
|
||||
Free->eraseFromParent();
|
||||
}
|
||||
|
||||
SmallPtrSet<GlobalVariable *, 4> GlobalsToErase;
|
||||
for (GlobalVariable &Global : M.globals())
|
||||
replaceGlobalArray(M, DL, Global, GlobalsToErase);
|
||||
for (GlobalVariable *G : GlobalsToErase)
|
||||
G->eraseFromParent();
|
||||
|
||||
// Rewrite allocas to cudaMallocs if we are asked to do so.
|
||||
if (RewriteAllocas) {
|
||||
SmallSet<AllocaInst *, 4> AllocasToBeManaged;
|
||||
for (Function &F : M.functions())
|
||||
getAllocasToBeManaged(F, AllocasToBeManaged);
|
||||
|
||||
for (AllocaInst *Alloca : AllocasToBeManaged)
|
||||
rewriteAllocaAsManagedMemory(Alloca, DL);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
char ManagedMemoryRewritePass::ID = 42;
|
||||
|
||||
Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch,
|
||||
GPURuntime Runtime) {
|
||||
ManagedMemoryRewritePass *pass = new ManagedMemoryRewritePass();
|
||||
pass->Runtime = Runtime;
|
||||
pass->Architecture = Arch;
|
||||
return pass;
|
||||
}
|
||||
|
||||
INITIALIZE_PASS_BEGIN(
|
||||
ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
|
||||
"Polly - Rewrite all allocations in heap & data section to managed memory",
|
||||
false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(PPCGCodeGeneration);
|
||||
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
|
||||
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
|
||||
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
|
||||
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
|
||||
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
|
||||
INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
|
||||
INITIALIZE_PASS_END(
|
||||
ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
|
||||
"Polly - Rewrite all allocations in heap & data section to managed memory",
|
||||
false, false)
|
||||
File diff suppressed because it is too large
Load Diff
@ -9,7 +9,6 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "polly/CodeGen/RuntimeDebugBuilder.h"
|
||||
#include "llvm/IR/IntrinsicsNVPTX.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@ -17,6 +16,16 @@
|
||||
using namespace llvm;
|
||||
using namespace polly;
|
||||
|
||||
llvm::Value *RuntimeDebugBuilder::getPrintableString(PollyIRBuilder &Builder,
|
||||
llvm::StringRef Str) {
|
||||
// FIXME: addressspace(4) is a marker for a string (for the %s conversion
|
||||
// specifier) but should be using the default address space. This only works
|
||||
// because CPU backends typically ignore the address space. For constant
|
||||
// strings as returned by getPrintableString, the format string should instead
|
||||
// directly spell out the string.
|
||||
return Builder.CreateGlobalStringPtr(Str, "", 4);
|
||||
}
|
||||
|
||||
Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) {
|
||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
const char *Name = "vprintf";
|
||||
@ -33,72 +42,9 @@ Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) {
|
||||
return F;
|
||||
}
|
||||
|
||||
Function *RuntimeDebugBuilder::getAddressSpaceCast(PollyIRBuilder &Builder,
|
||||
unsigned Src, unsigned Dst,
|
||||
unsigned SrcBits,
|
||||
unsigned DstBits) {
|
||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
auto Name = std::string("llvm.nvvm.ptr.constant.to.gen.p") +
|
||||
std::to_string(Dst) + "i" + std::to_string(DstBits) + ".p" +
|
||||
std::to_string(Src) + "i" + std::to_string(SrcBits);
|
||||
Function *F = M->getFunction(Name);
|
||||
|
||||
if (!F) {
|
||||
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
||||
FunctionType *Ty = FunctionType::get(
|
||||
PointerType::get(Builder.getIntNTy(DstBits), Dst),
|
||||
PointerType::get(Builder.getIntNTy(SrcBits), Src), false);
|
||||
F = Function::Create(Ty, Linkage, Name, M);
|
||||
}
|
||||
|
||||
return F;
|
||||
}
|
||||
|
||||
std::vector<Value *>
|
||||
RuntimeDebugBuilder::getGPUThreadIdentifiers(PollyIRBuilder &Builder) {
|
||||
std::vector<Value *> Identifiers;
|
||||
|
||||
auto M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
|
||||
std::vector<Function *> BlockIDs = {
|
||||
Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_x),
|
||||
Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_y),
|
||||
Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_z),
|
||||
};
|
||||
|
||||
Identifiers.push_back(Builder.CreateGlobalStringPtr("> block-id: ", "", 4));
|
||||
for (auto GetID : BlockIDs) {
|
||||
Value *Id = Builder.CreateCall(GetID, {});
|
||||
Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false);
|
||||
Identifiers.push_back(Id);
|
||||
Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4));
|
||||
}
|
||||
|
||||
Identifiers.push_back(Builder.CreateGlobalStringPtr("| ", "", 4));
|
||||
|
||||
std::vector<Function *> ThreadIDs = {
|
||||
Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_x),
|
||||
Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_y),
|
||||
Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_z),
|
||||
};
|
||||
|
||||
Identifiers.push_back(Builder.CreateGlobalStringPtr("thread-id: ", "", 4));
|
||||
for (auto GetId : ThreadIDs) {
|
||||
Value *Id = Builder.CreateCall(GetId, {});
|
||||
Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false);
|
||||
Identifiers.push_back(Id);
|
||||
Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4));
|
||||
}
|
||||
|
||||
return Identifiers;
|
||||
}
|
||||
|
||||
void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, bool IsGPU,
|
||||
void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder,
|
||||
ArrayRef<Value *> Values) {
|
||||
if (IsGPU)
|
||||
createGPUPrinterT(Builder, Values);
|
||||
else
|
||||
createCPUPrinterT(Builder, Values);
|
||||
createCPUPrinterT(Builder, Values);
|
||||
}
|
||||
|
||||
bool RuntimeDebugBuilder::isPrintable(Type *Ty) {
|
||||
@ -169,78 +115,6 @@ void RuntimeDebugBuilder::createCPUPrinterT(PollyIRBuilder &Builder,
|
||||
createFlush(Builder);
|
||||
}
|
||||
|
||||
void RuntimeDebugBuilder::createGPUPrinterT(PollyIRBuilder &Builder,
|
||||
ArrayRef<Value *> Values) {
|
||||
std::string str;
|
||||
|
||||
auto *Zero = Builder.getInt64(0);
|
||||
|
||||
auto ToPrint = getGPUThreadIdentifiers(Builder);
|
||||
|
||||
ToPrint.push_back(Builder.CreateGlobalStringPtr("\n ", "", 4));
|
||||
ToPrint.insert(ToPrint.end(), Values.begin(), Values.end());
|
||||
|
||||
const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
|
||||
|
||||
// Allocate print buffer (assuming 2*32 bit per element)
|
||||
auto T = ArrayType::get(Builder.getInt32Ty(), ToPrint.size() * 2);
|
||||
Value *Data = new AllocaInst(
|
||||
T, DL.getAllocaAddrSpace(), "polly.vprint.buffer",
|
||||
&Builder.GetInsertBlock()->getParent()->getEntryBlock().front());
|
||||
auto *DataPtr = Builder.CreateGEP(T, Data, {Zero, Zero});
|
||||
|
||||
int Offset = 0;
|
||||
for (auto Val : ToPrint) {
|
||||
auto Ptr = Builder.CreateGEP(Builder.getInt32Ty(), DataPtr,
|
||||
Builder.getInt64(Offset));
|
||||
Type *Ty = Val->getType();
|
||||
|
||||
if (Ty->isFloatingPointTy()) {
|
||||
if (!Ty->isDoubleTy())
|
||||
Val = Builder.CreateFPExt(Val, Builder.getDoubleTy());
|
||||
} else if (Ty->isIntegerTy()) {
|
||||
if (Ty->getIntegerBitWidth() < 64) {
|
||||
Val = Builder.CreateSExt(Val, Builder.getInt64Ty());
|
||||
} else {
|
||||
assert(Ty->getIntegerBitWidth() == 64 &&
|
||||
"Integer types larger 64 bit not supported");
|
||||
// fallthrough
|
||||
}
|
||||
} else if (isa<PointerType>(Ty)) {
|
||||
if (Ty == Builder.getInt8PtrTy(4)) {
|
||||
// Pointers in constant address space are printed as strings
|
||||
Val = Builder.CreateGEP(Builder.getInt8Ty(), Val, Builder.getInt64(0));
|
||||
auto F = RuntimeDebugBuilder::getAddressSpaceCast(Builder, 4, 0);
|
||||
Val = Builder.CreateCall(F, Val);
|
||||
} else {
|
||||
Val = Builder.CreatePtrToInt(Val, Builder.getInt64Ty());
|
||||
}
|
||||
} else {
|
||||
llvm_unreachable("Unknown type");
|
||||
}
|
||||
|
||||
Ty = Val->getType();
|
||||
Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Ty->getPointerTo(5));
|
||||
Builder.CreateAlignedStore(Val, Ptr, Align(4));
|
||||
|
||||
if (Ty->isFloatingPointTy())
|
||||
str += "%f";
|
||||
else if (Ty->isIntegerTy())
|
||||
str += "%ld";
|
||||
else
|
||||
str += "%s";
|
||||
|
||||
Offset += 2;
|
||||
}
|
||||
|
||||
Value *Format = Builder.CreateGlobalStringPtr(str, "polly.vprintf.buffer", 4);
|
||||
Format = Builder.CreateCall(getAddressSpaceCast(Builder, 4, 0), Format);
|
||||
|
||||
Data = Builder.CreateBitCast(Data, Builder.getInt8PtrTy());
|
||||
|
||||
Builder.CreateCall(getVPrintF(Builder), {Format, Data});
|
||||
}
|
||||
|
||||
Function *RuntimeDebugBuilder::getPrintF(PollyIRBuilder &Builder) {
|
||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
const char *Name = "printf";
|
||||
|
||||
88
polly/lib/External/CMakeLists.txt
vendored
88
polly/lib/External/CMakeLists.txt
vendored
@ -314,91 +314,3 @@ if (POLLY_BUNDLED_ISL)
|
||||
target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS})
|
||||
target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS})
|
||||
endif (POLLY_BUNDLED_ISL)
|
||||
|
||||
|
||||
# External: Polyhedral Parallel Code Generator
|
||||
if (GPU_CODEGEN)
|
||||
set(PET_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pet")
|
||||
set(PPCG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ppcg")
|
||||
set(PPCG_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/ppcg")
|
||||
|
||||
# Determine version of ppcg
|
||||
# Work out which PPCG revision is bundled and store it in PPCG_GIT_HEAD_ID.
# Sources are probed in order: a GIT_HEAD_ID file, then a gitversion.h header,
# and finally a fixed "UNKNOWN" marker so the variable is always defined.
if (EXISTS "${PPCG_SOURCE_DIR}/GIT_HEAD_ID")
  # The source comes from a 'make dist' archive
  file(READ "${PPCG_SOURCE_DIR}/GIT_HEAD_ID" PPCG_GIT_HEAD_ID)
  string(STRIP "${PPCG_GIT_HEAD_ID}" PPCG_GIT_HEAD_ID)
elseif (EXISTS "${PPCG_SOURCE_DIR}/gitversion.h")
  # The source directory is preconfigured; extract the quoted revision string
  file(READ "${PPCG_SOURCE_DIR}/gitversion.h" GITVERSION_H)
  string(REGEX REPLACE ".*\\\"([^\\\"]*)\\\".*" "\\1" PPCG_GIT_HEAD_ID "${GITVERSION_H}")
else ()
  # Unknown revision. This must be a plain else(): an elseif() with an empty
  # condition never evaluates to true, which left the variable unset.
  # TODO: We could look for a .git and get the revision from HEAD
  set(PPCG_GIT_HEAD_ID "UNKNOWN")
endif ()
|
||||
|
||||
message(STATUS "PPCG version: ${PPCG_GIT_HEAD_ID}")
|
||||
|
||||
# Sources of the bundled PPCG library, consumed by add_polly_library(PollyPPCG).
# Each file is listed exactly once (ppcg/gpu_array_tile.c was previously
# duplicated).
set (PPCG_FILES
  ppcg/cuda.c
  ppcg/cuda_common.c
  ppcg/external.c
  ppcg/gpu_array_tile.c
  ppcg/gpu.c
  ppcg/gpu_group.c
  ppcg/gpu_hybrid.c
  ppcg/gpu_print.c
  ppcg/gpu_tree.c
  ppcg/grouping.c
  ppcg/hybrid.c
  ppcg/ppcg.c
  ppcg/ppcg_options.c
  ppcg/print.c
  ppcg/schedule.c
  ppcg/util.c
)
|
||||
|
||||
include_directories(BEFORE
|
||||
${PPCG_BINARY_DIR}
|
||||
${PPCG_SOURCE_DIR}/imath
|
||||
${PPCG_SOURCE_DIR}/include
|
||||
${PET_SOURCE_DIR}/include
|
||||
)
|
||||
|
||||
add_polly_library(PollyPPCG
|
||||
${PPCG_FILES}
|
||||
)
|
||||
|
||||
target_link_libraries(PollyPPCG PUBLIC ${ISL_TARGET})
|
||||
|
||||
# Disable warnings for upstream projects.
|
||||
if (MSVC)
|
||||
set(DISABLE_WARNING_FLAGS
|
||||
-wd4018 # 'expression' : signed/unsigned mismatch
|
||||
-wd4090 # 'operation' : different 'modifier' qualifiers
|
||||
-wd4200 # nonstandard extension used: zero-sized array in struct/union
|
||||
-wd4201 # nonstandard extension used: nameless struct/union
|
||||
-wd4334 # 'operator': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
|
||||
-wd4221 # nonstandard extension used : 'identifier' : cannot be initialized using address of automatic variable
|
||||
)
|
||||
if (POLLY_BUNDLED_ISL)
|
||||
target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS})
|
||||
target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS})
|
||||
endif (POLLY_BUNDLED_ISL)
|
||||
target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS})
|
||||
else ()
|
||||
if (POLLY_BUNDLED_ISL)
|
||||
set_target_properties(PollyISL polly-isl-test PROPERTIES COMPILE_FLAGS "-w")
|
||||
endif (POLLY_BUNDLED_ISL)
|
||||
set_target_properties(PollyPPCG PROPERTIES COMPILE_FLAGS "-w")
|
||||
endif ()
|
||||
|
||||
if(MSVC)
|
||||
# In the Windows API (with some exceptions), the maximum length for a path is
|
||||
# MAX_PATH, which is defined as 260 characters.
|
||||
target_compile_definitions(PollyPPCG PRIVATE "-DPATH_MAX=260")
|
||||
endif ()
|
||||
|
||||
target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS})
|
||||
endif ()
|
||||
|
||||
622
polly/lib/External/pet/include/pet.h
vendored
622
polly/lib/External/pet/include/pet.h
vendored
@ -1,622 +0,0 @@
|
||||
#ifndef PET_H
|
||||
#define PET_H
|
||||
|
||||
#include <isl/aff.h>
|
||||
#include <isl/arg.h>
|
||||
#include <isl/ast_build.h>
|
||||
#include <isl/set.h>
|
||||
#include <isl/map.h>
|
||||
#include <isl/union_map.h>
|
||||
#include <isl/printer.h>
|
||||
#include <isl/id_to_ast_expr.h>
|
||||
#include <isl/id_to_pw_aff.h>
|
||||
#include <isl/schedule.h>
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct pet_options;
|
||||
ISL_ARG_DECL(pet_options, struct pet_options, pet_options_args)
|
||||
|
||||
/* Create an isl_ctx that references the pet options. */
|
||||
isl_ctx *isl_ctx_alloc_with_pet_options();
|
||||
|
||||
/* If autodetect is set, any valid scop is extracted.
|
||||
* Otherwise, the scop needs to be delimited by pragmas.
|
||||
*/
|
||||
int pet_options_set_autodetect(isl_ctx *ctx, int val);
|
||||
int pet_options_get_autodetect(isl_ctx *ctx);
|
||||
|
||||
int pet_options_set_detect_conditional_assignment(isl_ctx *ctx, int val);
|
||||
int pet_options_get_detect_conditional_assignment(isl_ctx *ctx);
|
||||
|
||||
/* If encapsulate-dynamic-control is set, then any dynamic control
|
||||
* in the input program will be encapsulated in macro statements.
|
||||
* This means in particular that no statements with arguments
|
||||
* will be created.
|
||||
*/
|
||||
int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val);
|
||||
int pet_options_get_encapsulate_dynamic_control(isl_ctx *ctx);
|
||||
|
||||
#define PET_OVERFLOW_AVOID 0
|
||||
#define PET_OVERFLOW_IGNORE 1
|
||||
int pet_options_set_signed_overflow(isl_ctx *ctx, int val);
|
||||
int pet_options_get_signed_overflow(isl_ctx *ctx);
|
||||
|
||||
struct pet_loc;
|
||||
typedef struct pet_loc pet_loc;
|
||||
|
||||
/* Return an additional reference to "loc". */
|
||||
__isl_give pet_loc *pet_loc_copy(__isl_keep pet_loc *loc);
|
||||
/* Free a reference to "loc". */
|
||||
pet_loc *pet_loc_free(__isl_take pet_loc *loc);
|
||||
|
||||
/* Return the offset in the input file of the start of "loc". */
|
||||
unsigned pet_loc_get_start(__isl_keep pet_loc *loc);
|
||||
/* Return the offset in the input file of the character after "loc". */
|
||||
unsigned pet_loc_get_end(__isl_keep pet_loc *loc);
|
||||
/* Return the line number of a line within the "loc" region. */
|
||||
int pet_loc_get_line(__isl_keep pet_loc *loc);
|
||||
/* Return the indentation of the "loc" region. */
|
||||
__isl_keep const char *pet_loc_get_indent(__isl_keep pet_loc *loc);
|
||||
|
||||
enum pet_expr_type {
|
||||
pet_expr_error = -1,
|
||||
pet_expr_access,
|
||||
pet_expr_call,
|
||||
pet_expr_cast,
|
||||
pet_expr_int,
|
||||
pet_expr_double,
|
||||
pet_expr_op
|
||||
};
|
||||
|
||||
enum pet_op_type {
|
||||
/* only compound assignment operators before assignment */
|
||||
pet_op_add_assign,
|
||||
pet_op_sub_assign,
|
||||
pet_op_mul_assign,
|
||||
pet_op_div_assign,
|
||||
pet_op_and_assign,
|
||||
pet_op_xor_assign,
|
||||
pet_op_or_assign,
|
||||
pet_op_assign,
|
||||
pet_op_add,
|
||||
pet_op_sub,
|
||||
pet_op_mul,
|
||||
pet_op_div,
|
||||
pet_op_mod,
|
||||
pet_op_shl,
|
||||
pet_op_shr,
|
||||
pet_op_eq,
|
||||
pet_op_ne,
|
||||
pet_op_le,
|
||||
pet_op_ge,
|
||||
pet_op_lt,
|
||||
pet_op_gt,
|
||||
pet_op_minus,
|
||||
pet_op_post_inc,
|
||||
pet_op_post_dec,
|
||||
pet_op_pre_inc,
|
||||
pet_op_pre_dec,
|
||||
pet_op_address_of,
|
||||
pet_op_assume,
|
||||
pet_op_kill,
|
||||
pet_op_and,
|
||||
pet_op_xor,
|
||||
pet_op_or,
|
||||
pet_op_not,
|
||||
pet_op_land,
|
||||
pet_op_lor,
|
||||
pet_op_lnot,
|
||||
pet_op_cond,
|
||||
pet_op_last
|
||||
};
|
||||
|
||||
/* Index into the pet_expr->args array when pet_expr->type == pet_expr_unary
|
||||
*/
|
||||
enum pet_un_arg_type {
|
||||
pet_un_arg
|
||||
};
|
||||
|
||||
/* Indices into the pet_expr->args array when
|
||||
* pet_expr->type == pet_expr_binary
|
||||
*/
|
||||
enum pet_bin_arg_type {
|
||||
pet_bin_lhs,
|
||||
pet_bin_rhs
|
||||
};
|
||||
|
||||
/* Indices into the pet_expr->args array when
|
||||
* pet_expr->type == pet_expr_ternary
|
||||
*/
|
||||
enum pet_ter_arg_type {
|
||||
pet_ter_cond,
|
||||
pet_ter_true,
|
||||
pet_ter_false
|
||||
};
|
||||
|
||||
struct pet_expr;
|
||||
typedef struct pet_expr pet_expr;
|
||||
|
||||
/* Return an additional reference to "expr". */
|
||||
__isl_give pet_expr *pet_expr_copy(__isl_keep pet_expr *expr);
|
||||
/* Free a reference to "expr". */
|
||||
__isl_null pet_expr *pet_expr_free(__isl_take pet_expr *expr);
|
||||
|
||||
/* Return the isl_ctx in which "expr" was created. */
|
||||
isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr);
|
||||
|
||||
/* Return the type of "expr". */
|
||||
enum pet_expr_type pet_expr_get_type(__isl_keep pet_expr *expr);
|
||||
/* Return the number of arguments of "expr". */
|
||||
int pet_expr_get_n_arg(__isl_keep pet_expr *expr);
|
||||
/* Set the number of arguments of "expr" to "n". */
|
||||
__isl_give pet_expr *pet_expr_set_n_arg(__isl_take pet_expr *expr, int n);
|
||||
/* Return the argument of "expr" at position "pos". */
|
||||
__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos);
|
||||
/* Replace the argument of "expr" at position "pos" by "arg". */
|
||||
__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos,
|
||||
__isl_take pet_expr *arg);
|
||||
|
||||
/* Return the operation type of operation expression "expr". */
|
||||
enum pet_op_type pet_expr_op_get_type(__isl_keep pet_expr *expr);
|
||||
/* Replace the operation type of operation expression "expr" by "type". */
|
||||
__isl_give pet_expr *pet_expr_op_set_type(__isl_take pet_expr *expr,
|
||||
enum pet_op_type type);
|
||||
|
||||
/* Construct a (read) access pet_expr from an index expression. */
|
||||
__isl_give pet_expr *pet_expr_from_index(__isl_take isl_multi_pw_aff *index);
|
||||
|
||||
/* Does "expr" represent an affine expression? */
|
||||
isl_bool pet_expr_is_affine(__isl_keep pet_expr *expr);
|
||||
/* Does the access expression "expr" read the accessed elements? */
|
||||
isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr);
|
||||
/* Does the access expression "expr" write to the accessed elements? */
|
||||
isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr);
|
||||
/* Does the access expression "expr" kill the accessed elements? */
|
||||
isl_bool pet_expr_access_is_kill(__isl_keep pet_expr *expr);
|
||||
/* Mark "expr" as a read depending on "read". */
|
||||
__isl_give pet_expr *pet_expr_access_set_read(__isl_take pet_expr *expr,
|
||||
int read);
|
||||
/* Mark "expr" as a write depending on "write". */
|
||||
__isl_give pet_expr *pet_expr_access_set_write(__isl_take pet_expr *expr,
|
||||
int write);
|
||||
/* Mark "expr" as a kill depending on "kill". */
|
||||
__isl_give pet_expr *pet_expr_access_set_kill(__isl_take pet_expr *expr,
|
||||
int kill);
|
||||
/* Return the reference identifier of access expression "expr". */
|
||||
__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr);
|
||||
/* Replace the reference identifier of access expression "expr" by "ref_id". */
|
||||
__isl_give pet_expr *pet_expr_access_set_ref_id(__isl_take pet_expr *expr,
|
||||
__isl_take isl_id *ref_id);
|
||||
/* Return the identifier of the outer array accessed by "expr". */
|
||||
__isl_give isl_id *pet_expr_access_get_id(__isl_keep pet_expr *expr);
|
||||
/* Return the index expression of access expression "expr". */
|
||||
__isl_give isl_multi_pw_aff *pet_expr_access_get_index(
|
||||
__isl_keep pet_expr *expr);
|
||||
|
||||
/* Return the potential read access relation of access expression "expr". */
|
||||
__isl_give isl_union_map *pet_expr_access_get_may_read(
|
||||
__isl_keep pet_expr *expr);
|
||||
/* Return the potential write access relation of access expression "expr". */
|
||||
__isl_give isl_union_map *pet_expr_access_get_may_write(
|
||||
__isl_keep pet_expr *expr);
|
||||
/* Return the definite write access relation of access expression "expr". */
|
||||
__isl_give isl_union_map *pet_expr_access_get_must_write(
|
||||
__isl_keep pet_expr *expr);
|
||||
/* Return the argument dependent potential read access relation of "expr". */
|
||||
__isl_give isl_union_map *pet_expr_access_get_dependent_may_read(
|
||||
__isl_keep pet_expr *expr);
|
||||
/* Return the argument dependent potential write access relation of "expr". */
|
||||
__isl_give isl_union_map *pet_expr_access_get_dependent_may_write(
|
||||
__isl_keep pet_expr *expr);
|
||||
/* Return the argument dependent definite write access relation of "expr". */
|
||||
__isl_give isl_union_map *pet_expr_access_get_dependent_must_write(
|
||||
__isl_keep pet_expr *expr);
|
||||
/* Return the tagged potential read access relation of access "expr". */
|
||||
__isl_give isl_union_map *pet_expr_access_get_tagged_may_read(
|
||||
__isl_keep pet_expr *expr);
|
||||
/* Return the tagged potential write access relation of access "expr". */
|
||||
__isl_give isl_union_map *pet_expr_access_get_tagged_may_write(
|
||||
__isl_keep pet_expr *expr);
|
||||
|
||||
/* Return the name of the function called by "expr". */
|
||||
__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr);
|
||||
/* Replace the name of the function called by "expr" by "name". */
|
||||
__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr,
|
||||
__isl_keep const char *name);
|
||||
|
||||
/* Create a pet_expr representing a cast of "arg" to "type_name". */
|
||||
__isl_give pet_expr *pet_expr_new_cast(const char *type_name,
|
||||
__isl_take pet_expr *arg);
|
||||
/* Replace the type of the cast performed by "expr" by "name". */
|
||||
__isl_give pet_expr *pet_expr_cast_set_type_name(__isl_take pet_expr *expr,
|
||||
__isl_keep const char *name);
|
||||
|
||||
/* Return the value of the integer represented by "expr". */
|
||||
__isl_give isl_val *pet_expr_int_get_val(__isl_keep pet_expr *expr);
|
||||
/* Replace the value of the integer represented by "expr" by "v". */
|
||||
__isl_give pet_expr *pet_expr_int_set_val(__isl_take pet_expr *expr,
|
||||
__isl_take isl_val *v);
|
||||
|
||||
/* Return a string representation of the double expression "expr". */
|
||||
__isl_give char *pet_expr_double_get_str(__isl_keep pet_expr *expr);
|
||||
/* Replace value and string representation of the double expression "expr" */
|
||||
__isl_give pet_expr *pet_expr_double_set(__isl_take pet_expr *expr,
|
||||
double d, __isl_keep const char *s);
|
||||
|
||||
/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_access. */
|
||||
int pet_expr_foreach_access_expr(__isl_keep pet_expr *expr,
|
||||
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
|
||||
/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_call. */
|
||||
int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr,
|
||||
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
|
||||
|
||||
struct pet_context;
|
||||
typedef struct pet_context pet_context;
|
||||
|
||||
/* Create a context with the given domain. */
|
||||
__isl_give pet_context *pet_context_alloc(__isl_take isl_set *domain);
|
||||
/* Return an additional reference to "pc". */
|
||||
__isl_give pet_context *pet_context_copy(__isl_keep pet_context *pc);
|
||||
/* Free a reference to "pc". */
|
||||
__isl_null pet_context *pet_context_free(__isl_take pet_context *pc);
|
||||
|
||||
/* Return the isl_ctx in which "pc" was created. */
|
||||
isl_ctx *pet_context_get_ctx(__isl_keep pet_context *pc);
|
||||
|
||||
/* Extract an affine expression defined over the domain of "pc" from "expr"
|
||||
* or return NaN.
|
||||
*/
|
||||
__isl_give isl_pw_aff *pet_expr_extract_affine(__isl_keep pet_expr *expr,
|
||||
__isl_keep pet_context *pc);
|
||||
|
||||
void pet_expr_dump(__isl_keep pet_expr *expr);
|
||||
|
||||
enum pet_tree_type {
|
||||
pet_tree_error = -1,
|
||||
pet_tree_expr,
|
||||
pet_tree_block,
|
||||
pet_tree_break,
|
||||
pet_tree_continue,
|
||||
pet_tree_decl, /* A declaration without initialization */
|
||||
pet_tree_decl_init, /* A declaration with initialization */
|
||||
pet_tree_if, /* An if without an else branch */
|
||||
pet_tree_if_else, /* An if with an else branch */
|
||||
pet_tree_for,
|
||||
pet_tree_infinite_loop,
|
||||
pet_tree_while,
|
||||
pet_tree_return,
|
||||
};
|
||||
|
||||
struct pet_tree;
|
||||
typedef struct pet_tree pet_tree;
|
||||
|
||||
/* Return the isl_ctx in which "tree" was created. */
|
||||
isl_ctx *pet_tree_get_ctx(__isl_keep pet_tree *tree);
|
||||
|
||||
/* Return an additional reference to "tree". */
|
||||
__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree);
|
||||
/* Free a reference to "tree". */
|
||||
__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree);
|
||||
|
||||
/* Return the location of "tree". */
|
||||
__isl_give pet_loc *pet_tree_get_loc(__isl_keep pet_tree *tree);
|
||||
|
||||
/* Return the type of "tree". */
|
||||
enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree);
|
||||
|
||||
/* Return the expression of the expression tree "tree". */
|
||||
__isl_give pet_expr *pet_tree_expr_get_expr(__isl_keep pet_tree *tree);
|
||||
|
||||
/* Return the expression returned by the return tree "tree". */
|
||||
__isl_give pet_expr *pet_tree_return_get_expr(__isl_keep pet_tree *tree);
|
||||
|
||||
/* Return the number of children of the block tree "tree". */
|
||||
int pet_tree_block_n_child(__isl_keep pet_tree *tree);
|
||||
/* Return child "pos" of the block tree "tree". */
|
||||
__isl_give pet_tree *pet_tree_block_get_child(__isl_keep pet_tree *tree,
|
||||
int pos);
|
||||
|
||||
/* Is "tree" a declaration (with or without initialization)? */
|
||||
int pet_tree_is_decl(__isl_keep pet_tree *tree);
|
||||
/* Return the variable declared by the declaration tree "tree". */
|
||||
__isl_give pet_expr *pet_tree_decl_get_var(__isl_keep pet_tree *tree);
|
||||
/* Return the initial value of the pet_tree_decl_init tree "tree". */
|
||||
__isl_give pet_expr *pet_tree_decl_get_init(__isl_keep pet_tree *tree);
|
||||
|
||||
/* Return the condition of the if tree "tree". */
|
||||
__isl_give pet_expr *pet_tree_if_get_cond(__isl_keep pet_tree *tree);
|
||||
/* Return the then branch of the if tree "tree". */
|
||||
__isl_give pet_tree *pet_tree_if_get_then(__isl_keep pet_tree *tree);
|
||||
/* Return the else branch of the if tree with else branch "tree". */
|
||||
__isl_give pet_tree *pet_tree_if_get_else(__isl_keep pet_tree *tree);
|
||||
|
||||
/* Is "tree" a for loop, a while loop or an infinite loop? */
|
||||
int pet_tree_is_loop(__isl_keep pet_tree *tree);
|
||||
/* Return the induction variable of the for loop "tree" */
|
||||
__isl_give pet_expr *pet_tree_loop_get_var(__isl_keep pet_tree *tree);
|
||||
/* Return the initial value of the induction variable of the for loop "tree" */
|
||||
__isl_give pet_expr *pet_tree_loop_get_init(__isl_keep pet_tree *tree);
|
||||
/* Return the condition of the loop tree "tree" */
|
||||
__isl_give pet_expr *pet_tree_loop_get_cond(__isl_keep pet_tree *tree);
|
||||
/* Return the increment of the induction variable of the for loop "tree" */
|
||||
__isl_give pet_expr *pet_tree_loop_get_inc(__isl_keep pet_tree *tree);
|
||||
/* Return the body of the loop tree "tree" */
|
||||
__isl_give pet_tree *pet_tree_loop_get_body(__isl_keep pet_tree *tree);
|
||||
|
||||
/* Call "fn" on each top-level expression in the nodes of "tree" */
|
||||
int pet_tree_foreach_expr(__isl_keep pet_tree *tree,
|
||||
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
|
||||
/* Call "fn" on each access subexpression in the nodes of "tree" */
|
||||
int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree,
|
||||
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
|
||||
/* Modify all call subexpressions in the nodes of "tree" through "fn". */
|
||||
__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree,
|
||||
__isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user),
|
||||
void *user);
|
||||
|
||||
void pet_tree_dump(__isl_keep pet_tree *tree);
|
||||
|
||||
/* "loc" represents the region of the source code that is represented
|
||||
* by this statement.
|
||||
*
|
||||
* If the statement has arguments, i.e., n_arg != 0, then
|
||||
* "domain" is a wrapped map, mapping the iteration domain
|
||||
* to the values of the arguments for which this statement
|
||||
* is executed.
|
||||
* Otherwise, it is simply the iteration domain.
|
||||
*
|
||||
* If one of the arguments is an access expression that accesses
|
||||
* more than one element for a given iteration, then the constraints
|
||||
* on the value of this argument (encoded in "domain") should be satisfied
|
||||
* for all of those accessed elements.
|
||||
*/
|
||||
struct pet_stmt {
|
||||
/* Region of the source code that is represented by this statement. */
pet_loc *loc;
|
||||
/* Iteration domain; if n_arg != 0, a wrapped map from the iteration
 * domain to the argument values for which the statement is executed.
 */
isl_set *domain;
|
||||
/* NOTE(review): presumably the tree form of the statement -- confirm. */
pet_tree *body;
|
||||
|
||||
/* Number of arguments; a non-zero value changes the meaning of "domain". */
unsigned n_arg;
|
||||
/* NOTE(review): presumably the n_arg argument expressions -- confirm. */
pet_expr **args;
|
||||
};
|
||||
|
||||
/* Return the iteration space of "stmt". */
|
||||
__isl_give isl_space *pet_stmt_get_space(struct pet_stmt *stmt);
|
||||
|
||||
/* Is "stmt" an assignment statement? */
|
||||
int pet_stmt_is_assign(struct pet_stmt *stmt);
|
||||
/* Is "stmt" a kill statement? */
|
||||
int pet_stmt_is_kill(struct pet_stmt *stmt);
|
||||
|
||||
/* pet_stmt_build_ast_exprs is currently limited to only handle
|
||||
* some forms of data dependent accesses.
|
||||
* If pet_stmt_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs
|
||||
* can safely be called on "stmt".
|
||||
*/
|
||||
int pet_stmt_can_build_ast_exprs(struct pet_stmt *stmt);
|
||||
/* Construct an associative array from reference identifiers of
|
||||
* access expressions in "stmt" to the corresponding isl_ast_expr.
|
||||
* Each index expression is first transformed through "fn_index"
|
||||
* (if not NULL). Then an AST expression is generated using "build".
|
||||
* Finally, the AST expression is transformed using "fn_expr"
|
||||
* (if not NULL).
|
||||
*/
|
||||
__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt,
|
||||
__isl_keep isl_ast_build *build,
|
||||
__isl_give isl_multi_pw_aff *(*fn_index)(
|
||||
__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id,
|
||||
void *user), void *user_index,
|
||||
__isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr,
|
||||
__isl_keep isl_id *id, void *user), void *user_expr);
|
||||
|
||||
/* Print "stmt" to "p".
|
||||
*
|
||||
* The access expressions in "stmt" are replaced by the isl_ast_expr
|
||||
* associated to its reference identifier in "ref2expr".
|
||||
*/
|
||||
__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt,
|
||||
__isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr);
|
||||
|
||||
/* This structure represents a defined type.
|
||||
* "name" is the name of the type, while "definition" is a string
|
||||
* representation of its definition.
|
||||
*/
|
||||
struct pet_type {
|
||||
/* Name of the defined type. */
char *name;
|
||||
/* String representation of the definition of the type. */
char *definition;
|
||||
};
|
||||
|
||||
/* context holds constraints on the parameters that ensure that
|
||||
* this array has a valid (i.e., non-negative) size
|
||||
*
|
||||
* extent holds constraints on the indices
|
||||
*
|
||||
* value_bounds holds constraints on the elements of the array
|
||||
* and may be NULL if no such constraints were specified by the user
|
||||
*
|
||||
* element_size is the size in bytes of each array element
|
||||
* element_type is the type of the array elements.
|
||||
* element_is_record is set if this type is a record type.
|
||||
*
|
||||
* live_out is set if the array appears in a live-out pragma
|
||||
*
|
||||
* if uniquely_defined is set then the array is written by a single access
|
||||
* such that any element that is ever read
|
||||
* is known to be assigned exactly once before the read
|
||||
*
|
||||
* declared is set if the array was declared somewhere inside the scop.
|
||||
* exposed is set if the declared array is visible outside the scop.
|
||||
* outer is set if the type of the array elements is a record and
|
||||
* the fields of this record are represented by separate pet_array structures.
|
||||
*/
|
||||
struct pet_array {
|
||||
/* Parameter constraints ensuring a valid (non-negative) array size. */
isl_set *context;
|
||||
/* Constraints on the indices of the array. */
isl_set *extent;
|
||||
/* Constraints on the array elements; may be NULL if none were specified. */
isl_set *value_bounds;
|
||||
/* Type of the array elements. */
char *element_type;
|
||||
/* Set if the element type is a record type. */
int element_is_record;
|
||||
/* Size in bytes of each array element. */
int element_size;
|
||||
/* Set if the array appears in a live-out pragma. */
int live_out;
|
||||
/* Set if the array is written by a single access such that any element
 * that is ever read is assigned exactly once before the read.
 */
int uniquely_defined;
|
||||
/* Set if the array was declared somewhere inside the scop. */
int declared;
|
||||
/* Set if the declared array is visible outside the scop. */
int exposed;
|
||||
/* Set if the elements are records whose fields are represented
 * by separate pet_array structures.
 */
int outer;
|
||||
};
|
||||
|
||||
/* This structure represents an implication on a boolean filter.
|
||||
* In particular, if the filter value of an element in the domain
|
||||
* of "extension" is equal to "satisfied", then the filter values
|
||||
* of the corresponding images in "extension" are also equal
|
||||
* to "satisfied".
|
||||
*/
|
||||
struct pet_implication {
|
||||
/* Filter value to which the implication applies. */
int satisfied;
|
||||
/* Maps each element to the images that take on the same filter value
 * whenever the filter value of the element is equal to "satisfied".
 */
isl_map *extension;
|
||||
};
|
||||
|
||||
/* This structure represents an independence implied by a for loop
|
||||
* that is marked as independent in the source code.
|
||||
* "filter" contains pairs of statement instances that are guaranteed
|
||||
* not to be dependent on each other based on the independent for loop,
|
||||
* assuming that no dependences carried by this loop are implied
|
||||
* by the variables in "local".
|
||||
* "local" contains the variables that are local to the loop that was
|
||||
* marked independent.
|
||||
*/
|
||||
struct pet_independence {
|
||||
/* Pairs of statement instances guaranteed not to depend on each other. */
isl_union_map *filter;
|
||||
/* Variables that are local to the loop that was marked independent. */
isl_union_set *local;
|
||||
};
|
||||
|
||||
/* "loc" represents the region of the source code that is represented
|
||||
* by this scop.
|
||||
* If the scop was detected based on scop and endscop pragmas, then
|
||||
* the lines containing these pragmas are included in this region.
|
||||
* In the final result, the context describes the set of parameter values
|
||||
* for which the scop can be executed.
|
||||
* During the construction of the pet_scop, the context lives in a set space
|
||||
* where each dimension refers to an outer loop.
|
||||
* context_value describes assignments to the parameters (if any)
|
||||
* outside of the scop.
|
||||
*
|
||||
* "schedule" is the schedule of the statements in the scop.
|
||||
*
|
||||
* The n_type types define types that may be referenced by the arrays.
|
||||
*
|
||||
* The n_implication implications describe implications on boolean filters.
|
||||
*
|
||||
* The n_independence independences describe independences implied
|
||||
* by for loops that are marked independent in the source code.
|
||||
*/
|
||||
struct pet_scop {
|
||||
/* Region of the source code that is represented by this scop. */
pet_loc *loc;
|
||||
|
||||
/* In the final result, the parameter values for which the scop
 * can be executed.
 */
isl_set *context;
|
||||
/* Assignments to the parameters (if any) outside of the scop. */
isl_set *context_value;
|
||||
/* Schedule of the statements in the scop. */
isl_schedule *schedule;
|
||||
|
||||
/* Number of entries in "types". */
int n_type;
|
||||
/* Types that may be referenced by the arrays. */
struct pet_type **types;
|
||||
|
||||
/* NOTE(review): presumably the number of entries in "arrays" -- confirm. */
int n_array;
|
||||
/* NOTE(review): presumably the arrays accessed in the scop -- confirm. */
struct pet_array **arrays;
|
||||
|
||||
/* NOTE(review): presumably the number of entries in "stmts" -- confirm. */
int n_stmt;
|
||||
/* NOTE(review): presumably the statements of the scop -- confirm. */
struct pet_stmt **stmts;
|
||||
|
||||
/* Number of entries in "implications". */
int n_implication;
|
||||
/* Implications on boolean filters. */
struct pet_implication **implications;
|
||||
|
||||
/* Number of entries in "independences". */
int n_independence;
|
||||
/* Independences implied by for loops marked independent in the source. */
struct pet_independence **independences;
|
||||
};
|
||||
typedef struct pet_scop pet_scop;
|
||||
|
||||
/* Return a textual representation of the operator. */
|
||||
const char *pet_op_str(enum pet_op_type op);
|
||||
int pet_op_is_inc_dec(enum pet_op_type op);
|
||||
|
||||
/* Extract a pet_scop from a C source file.
|
||||
* If function is not NULL, then the pet_scop is extracted from
|
||||
* a function with that name.
|
||||
*/
|
||||
__isl_give pet_scop *pet_scop_extract_from_C_source(isl_ctx *ctx,
|
||||
const char *filename, const char *function);
|
||||
|
||||
/* Transform the C source file "input" by rewriting each scop.
|
||||
* When autodetecting scops, at most one scop per function is rewritten.
|
||||
* The transformed C code is written to "output".
|
||||
*/
|
||||
int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
|
||||
__isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
|
||||
__isl_take pet_scop *scop, void *user), void *user);
|
||||
/* Given a scop and a printer passed to a pet_transform_C_source callback,
|
||||
* print the original corresponding code to the printer.
|
||||
*/
|
||||
__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
|
||||
__isl_take isl_printer *p);
|
||||
|
||||
/* Update all isl_sets and isl_maps such that they all have the same
|
||||
* parameters in the same order.
|
||||
*/
|
||||
__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop);
|
||||
|
||||
/* Does "scop" contain any data dependent accesses? */
|
||||
int pet_scop_has_data_dependent_accesses(__isl_keep pet_scop *scop);
|
||||
/* Does "scop" contain any data dependent conditions? */
|
||||
int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop);
|
||||
/* pet_stmt_build_ast_exprs is currently limited to only handle
|
||||
* some forms of data dependent accesses.
|
||||
* If pet_scop_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs
|
||||
* can safely be called on all statements in the scop.
|
||||
*/
|
||||
int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop);
|
||||
|
||||
void pet_scop_dump(__isl_keep pet_scop *scop);
|
||||
__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop);
|
||||
|
||||
/* Return the context of "scop". */
|
||||
__isl_give isl_set *pet_scop_get_context(__isl_keep pet_scop *scop);
|
||||
/* Return the schedule of "scop". */
|
||||
__isl_give isl_schedule *pet_scop_get_schedule(__isl_keep pet_scop *scop);
|
||||
/* Return the set of all statement instances. */
|
||||
__isl_give isl_union_set *pet_scop_get_instance_set(__isl_keep pet_scop *scop);
|
||||
/* Return the potential read access relation. */
|
||||
__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop);
|
||||
/* Return the tagged potential read access relation. */
|
||||
__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
|
||||
__isl_keep pet_scop *scop);
|
||||
/* Return the potential write access relation. */
|
||||
__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop);
|
||||
/* Return the definite write access relation. */
|
||||
__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop);
|
||||
/* Return the tagged potential write access relation. */
|
||||
__isl_give isl_union_map *pet_scop_get_tagged_may_writes(
|
||||
__isl_keep pet_scop *scop);
|
||||
/* Return the tagged definite write access relation. */
|
||||
__isl_give isl_union_map *pet_scop_get_tagged_must_writes(
|
||||
__isl_keep pet_scop *scop);
|
||||
/* Return the definite kill access relation. */
|
||||
__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop);
|
||||
/* Return the tagged definite kill access relation. */
|
||||
__isl_give isl_union_map *pet_scop_get_tagged_must_kills(
|
||||
__isl_keep pet_scop *scop);
|
||||
|
||||
/* Compute a mapping from all outermost arrays (of structs) in scop
|
||||
* to their innermost members.
|
||||
*/
|
||||
__isl_give isl_union_map *pet_scop_compute_outer_to_inner(
|
||||
__isl_keep pet_scop *scop);
|
||||
/* Compute a mapping from all outermost arrays (of structs) in scop
|
||||
* to their members, including the outermost arrays themselves.
|
||||
*/
|
||||
__isl_give isl_union_map *pet_scop_compute_outer_to_any(
|
||||
__isl_keep pet_scop *scop);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
29
polly/lib/External/ppcg/ChangeLog
vendored
29
polly/lib/External/ppcg/ChangeLog
vendored
@ -1,29 +0,0 @@
|
||||
version: 0.07
|
||||
date: Tue Feb 7 17:23:22 CET 2017
|
||||
changes:
|
||||
- support hybrid tiling
|
||||
---
|
||||
version: 0.06
|
||||
date: Fri May 6 12:08:50 CEST 2016
|
||||
changes:
|
||||
- use PPCG specific macro names in generated code
|
||||
- complete transition to schedule trees
|
||||
- maximize coincidence by default
|
||||
- map arrays with constant index expressions to private memory
|
||||
- optionally group chains of statements
|
||||
---
|
||||
version: 0.05
|
||||
date: Fri Jan 15 09:30:23 CET 2016
|
||||
changes:
|
||||
- fix live-out computation
|
||||
- optionally compute schedule for C target
|
||||
- optionally perform tiling for C target
|
||||
- create single kernel for non-permutable subtree
|
||||
---
|
||||
version: 0.04
|
||||
date: Wed Jun 17 10:52:58 CEST 2015
|
||||
changes:
|
||||
- use schedule trees
|
||||
- fix live-range reordering
|
||||
- improve generation of synchronization
|
||||
- exploit independences during dependence analysis
|
||||
1
polly/lib/External/ppcg/GIT_HEAD_ID
vendored
1
polly/lib/External/ppcg/GIT_HEAD_ID
vendored
@ -1 +0,0 @@
|
||||
ppcg-0.07
|
||||
246
polly/lib/External/ppcg/README
vendored
246
polly/lib/External/ppcg/README
vendored
@ -1,246 +0,0 @@
|
||||
Requirements:
|
||||
|
||||
- automake, autoconf, libtool
|
||||
(not needed when compiling a release)
|
||||
- pkg-config (http://www.freedesktop.org/wiki/Software/pkg-config)
|
||||
(not needed when compiling a release using the included isl and pet)
|
||||
- gmp (http://gmplib.org/)
|
||||
- libyaml (http://pyyaml.org/wiki/LibYAML)
|
||||
(only needed if you want to compile the pet executable)
|
||||
- LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html)
|
||||
Unless you have some other reasons for wanting to use the svn version,
|
||||
it is best to install the latest release (3.9).
|
||||
For more details, see pet/README.
|
||||
|
||||
If you are installing on Ubuntu, then you can install the following packages:
|
||||
|
||||
automake autoconf libtool pkg-config libgmp3-dev libyaml-dev libclang-dev llvm
|
||||
|
||||
Note that you need at least version 3.2 of libclang-dev (ubuntu raring).
|
||||
Older versions of this package did not include the required libraries.
|
||||
If you are using an older version of ubuntu, then you need to compile and
|
||||
install LLVM/clang from source.
|
||||
|
||||
|
||||
Preparing:
|
||||
|
||||
Grab the latest release and extract it or get the source from
|
||||
the git repository as follows. This process requires autoconf,
|
||||
automake, libtool and pkg-config.
|
||||
|
||||
git clone git://repo.or.cz/ppcg.git
|
||||
cd ppcg
|
||||
./get_submodules.sh
|
||||
./autogen.sh
|
||||
|
||||
|
||||
Compilation:
|
||||
|
||||
./configure
|
||||
make
|
||||
make check
|
||||
|
||||
If you have installed any of the required libraries in a non-standard
|
||||
location, then you may need to use the --with-gmp-prefix,
|
||||
--with-libyaml-prefix and/or --with-clang-prefix options
|
||||
when calling "./configure".
|
||||
|
||||
|
||||
Using PPCG to generate CUDA or OpenCL code
|
||||
|
||||
To convert a fragment of a C program to CUDA, insert a line containing
|
||||
|
||||
#pragma scop
|
||||
|
||||
before the fragment and add a line containing
|
||||
|
||||
#pragma endscop
|
||||
|
||||
after the fragment. To generate CUDA code run
|
||||
|
||||
ppcg --target=cuda file.c
|
||||
|
||||
where file.c is the file containing the fragment. The generated
|
||||
code is stored in file_host.cu and file_kernel.cu.
|
||||
|
||||
To generate OpenCL code run
|
||||
|
||||
ppcg --target=opencl file.c
|
||||
|
||||
where file.c is the file containing the fragment. The generated code
|
||||
is stored in file_host.c and file_kernel.cl.
|
||||
|
||||
|
||||
Specifying tile, grid and block sizes
|
||||
|
||||
The iteration space tile size, grid size and block size can
|
||||
be specified using the --sizes option. The argument is a union map
|
||||
in isl notation mapping kernels identified by their sequence number
|
||||
in a "kernel" space to singleton sets in the "tile", "grid" and "block"
|
||||
spaces. The sizes are specified outermost to innermost.
|
||||
|
||||
The dimension of the "tile" space indicates the (maximal) number of loop
|
||||
dimensions to tile. The elements of the single integer tuple
|
||||
specify the tile sizes in each dimension.
|
||||
In case of hybrid tiling, the first element is half the size of
|
||||
the tile in the time (sequential) dimension. The second element
|
||||
specifies the number of elements in the base of the hexagon.
|
||||
The remaining elements specify the tile sizes in the remaining space
|
||||
dimensions.
|
||||
|
||||
The dimension of the "grid" space indicates the (maximal) number of block
|
||||
dimensions in the grid. The elements of the single integer tuple
|
||||
specify the number of blocks in each dimension.
|
||||
|
||||
The dimension of the "block" space indicates the (maximal) number of thread
|
||||
dimensions in the block. The elements of the single integer tuple
|
||||
specify the number of threads in each dimension.
|
||||
|
||||
For example,
|
||||
|
||||
{ kernel[0] -> tile[64,64]; kernel[i] -> block[16] : i != 4 }
|
||||
|
||||
specifies that in kernel 0, two loops should be tiled with a tile
|
||||
size of 64 in both dimensions and that all kernels except kernel 4
|
||||
should be run using a block of 16 threads.
|
||||
|
||||
Since PPCG performs some scheduling, it can be difficult to predict
|
||||
what exactly will end up in a kernel. If you want to specify
|
||||
tile, grid or block sizes, you may want to run PPCG first with the defaults,
|
||||
examine the kernels and then run PPCG again with the desired sizes.
|
||||
Instead of examining the kernels, you can also specify the option
|
||||
--dump-sizes on the first run to obtain the effectively used default sizes.
|
||||
|
||||
|
||||
Compiling the generated CUDA code with nvcc
|
||||
|
||||
To get optimal performance from nvcc, it is important to choose --arch
|
||||
according to your target GPU. Specifically, use the flag "--arch sm_20"
|
||||
for fermi, "--arch sm_30" for GK10x Kepler and "--arch sm_35" for
|
||||
GK110 Kepler. We discourage the use of older cards as we have seen
|
||||
correctness issues with compilation for older architectures.
|
||||
Note that in the absence of any --arch flag, nvcc defaults to
|
||||
"--arch sm_13". This will not only be slower, but can also cause
|
||||
correctness issues.
|
||||
If you want to obtain results that are identical to those obtained
|
||||
by the original code, then you may need to disable some optimizations
|
||||
by passing the "--fmad=false" option.
|
||||
|
||||
|
||||
Compiling the generated OpenCL code with gcc
|
||||
|
||||
To compile the host code you need to link against the file
|
||||
ocl_utilities.c which contains utility functions used by the generated
|
||||
OpenCL host code. To compile the host code with gcc, run
|
||||
|
||||
gcc -std=c99 file_host.c ocl_utilities.c -lOpenCL
|
||||
|
||||
Note that we have experienced the generated OpenCL code freezing
|
||||
on some inputs (e.g., the PolyBench symm benchmark) when using
|
||||
at least some versions of the Nvidia OpenCL library, while the
|
||||
corresponding CUDA code runs fine.
|
||||
We have experienced no such freezes when using AMD, ARM or Intel
|
||||
OpenCL libraries.
|
||||
|
||||
By default, the compiled executable will need the _kernel.cl file at
|
||||
run time. Alternatively, the option --opencl-embed-kernel-code may be
|
||||
given to place the kernel code in a string literal. The kernel code is
|
||||
then compiled into the host binary, such that the _kernel.cl file is no
|
||||
longer needed at run time. Any kernel include files, in particular
|
||||
those supplied using --opencl-include-file, will still be required at
|
||||
run time.
|
||||
|
||||
|
||||
Function calls
|
||||
|
||||
Function calls inside the analyzed fragment are reproduced
|
||||
in the CUDA or OpenCL code, but for now it is left to the user
|
||||
to make sure that the functions that are being called are
|
||||
available from the generated kernels.
|
||||
|
||||
In the case of OpenCL code, the --opencl-include-file option
|
||||
may be used to specify one or more files to be #include'd
|
||||
from the generated code. These files may then contain
|
||||
the definitions of the functions being called from the
|
||||
program fragment. If the pathnames of the included files
|
||||
are relative to the current directory, then you may need
|
||||
to additionally specify the --opencl-compiler-options=-I.
|
||||
to make sure that the files can be found by the OpenCL compiler.
|
||||
The included files may contain definitions of types used by the
|
||||
generated kernels. By default, PPCG generates definitions for
|
||||
types as needed, but these definitions may collide with those in
|
||||
the included files, as PPCG does not consider the contents of the
|
||||
included files. The --no-opencl-print-kernel-types option will prevent
|
||||
PPCG from generating type definitions.
|
||||
|
||||
|
||||
GNU extensions
|
||||
|
||||
By default, PPCG may print out macro definitions that involve
|
||||
GNU extensions such as __typeof__ and statement expressions.
|
||||
Some compilers may not support these extensions.
|
||||
In particular, OpenCL 1.2 beignet 1.1.1 (git-6de6918)
|
||||
has been reported not to support __typeof__.
|
||||
The use of these extensions can be turned off with the
|
||||
--no-allow-gnu-extensions option.
|
||||
|
||||
|
||||
Processing PolyBench
|
||||
|
||||
When processing a PolyBench/C 3.2 benchmark, you should always specify
|
||||
-DPOLYBENCH_USE_C99_PROTO on the ppcg command line. Otherwise, the source
|
||||
files are inconsistent, having fixed size arrays but parametrically
|
||||
bounded loops iterating over them.
|
||||
However, you should not specify this define when compiling
|
||||
the PPCG generated code using nvcc since CUDA does not support VLAs.
|
||||
|
||||
|
||||
CUDA and function overloading
|
||||
|
||||
While CUDA supports function overloading based on the argument types,
|
||||
no such function overloading exists in the input language C. Since PPCG
|
||||
simply prints out the same function name as in the original code, this
|
||||
may result in a different function being called based on the types
|
||||
of the arguments. For example, if the original code contains a call
|
||||
to the function sqrt() with a float argument, then the argument will
|
||||
be promoted to a double and the sqrt() function will be called.
|
||||
In the transformed (CUDA) code, however, overloading will cause the
|
||||
function sqrtf() to be called. Until this issue has been resolved in PPCG,
|
||||
we recommend that users either explicitly call the function sqrtf() or
|
||||
explicitly cast the argument to double in the input code.
|
||||
|
||||
|
||||
Contact
|
||||
|
||||
For bug reports, feature requests and questions,
|
||||
contact http://groups.google.com/group/isl-development
|
||||
|
||||
Whenever you report a bug, please mention the exact version of PPCG
|
||||
that you are using (output of "./ppcg --version"). If you are unable
|
||||
to compile PPCG, then report the git version (output of "git describe")
|
||||
or the version number included in the name of the tarball.
|
||||
|
||||
|
||||
Citing PPCG
|
||||
|
||||
If you use PPCG for your research, you are invited to cite
|
||||
the following paper.
|
||||
|
||||
@article{Verdoolaege2013PPCG,
|
||||
author = {Verdoolaege, Sven and Juega, Juan Carlos and Cohen, Albert and
|
||||
G\'{o}mez, Jos{\'e} Ignacio and Tenllado, Christian and
|
||||
Catthoor, Francky},
|
||||
title = {Polyhedral parallel code generation for CUDA},
|
||||
journal = {ACM Trans. Archit. Code Optim.},
|
||||
issue_date = {January 2013},
|
||||
volume = {9},
|
||||
number = {4},
|
||||
month = jan,
|
||||
year = {2013},
|
||||
issn = {1544-3566},
|
||||
pages = {54:1--54:23},
|
||||
doi = {10.1145/2400682.2400713},
|
||||
acmid = {2400713},
|
||||
publisher = {ACM},
|
||||
address = {New York, NY, USA},
|
||||
}
|
||||
802
polly/lib/External/ppcg/cpu.c
vendored
802
polly/lib/External/ppcg/cpu.c
vendored
@ -1,802 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012 INRIA Paris-Rocquencourt
|
||||
* Copyright 2012 Ecole Normale Superieure
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Tobias Grosser, INRIA Paris-Rocquencourt,
|
||||
* Domaine de Voluceau, Rocquenqourt, B.P. 105,
|
||||
* 78153 Le Chesnay Cedex France
|
||||
* and Sven Verdoolaege,
|
||||
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
|
||||
*/
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <isl/aff.h>
|
||||
#include <isl/ctx.h>
|
||||
#include <isl/flow.h>
|
||||
#include <isl/map.h>
|
||||
#include <isl/ast_build.h>
|
||||
#include <isl/schedule.h>
|
||||
#include <isl/schedule_node.h>
|
||||
#include <pet.h>
|
||||
|
||||
#include "ppcg.h"
|
||||
#include "ppcg_options.h"
|
||||
#include "cpu.h"
|
||||
#include "print.h"
|
||||
#include "schedule.h"
|
||||
#include "util.h"
|
||||
|
||||
/* Representation of a statement inside a generated AST.
|
||||
*
|
||||
* "stmt" refers to the original statement.
|
||||
* "ref2expr" maps the reference identifier of each access in
|
||||
* the statement to an AST expression that should be printed
|
||||
* at the place of the access.
|
||||
*/
|
||||
struct ppcg_stmt {
|
||||
/* The original statement; not released by ppcg_stmt_free. */
struct pet_stmt *stmt;
|
||||
|
||||
/* Maps each access reference identifier to the AST expression to print. */
isl_id_to_ast_expr *ref2expr;
|
||||
};
|
||||
|
||||
static void ppcg_stmt_free(void *user)
|
||||
{
|
||||
struct ppcg_stmt *stmt = user;
|
||||
|
||||
if (!stmt)
|
||||
return;
|
||||
|
||||
isl_id_to_ast_expr_free(stmt->ref2expr);
|
||||
|
||||
free(stmt);
|
||||
}
|
||||
|
||||
/* Derive the output file name from the input file name.
|
||||
* 'input' is the entire path of the input file. The output
|
||||
* is the file name plus the additional extension.
|
||||
*
|
||||
* We will basically replace everything after the last point
|
||||
* with '.ppcg.c'. This means file.c becomes file.ppcg.c
|
||||
*/
|
||||
static FILE *get_output_file(const char *input, const char *output)
|
||||
{
|
||||
char name[PATH_MAX];
|
||||
const char *ext;
|
||||
const char ppcg_marker[] = ".ppcg";
|
||||
int len;
|
||||
FILE *file;
|
||||
|
||||
len = ppcg_extract_base_name(name, input);
|
||||
|
||||
strcpy(name + len, ppcg_marker);
|
||||
ext = strrchr(input, '.');
|
||||
strcpy(name + len + sizeof(ppcg_marker) - 1, ext ? ext : ".c");
|
||||
|
||||
if (!output)
|
||||
output = name;
|
||||
|
||||
file = fopen(output, "w");
|
||||
if (!file) {
|
||||
fprintf(stderr, "Unable to open '%s' for writing\n", output);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return file;
|
||||
}
|
||||
|
||||
/* Data used to annotate for nodes in the ast.
|
||||
*/
|
||||
struct ast_node_userinfo {
|
||||
/* Initialized to 0; set to 1 by mark_openmp_parallel when the for node
 * is to be treated as an openmp parallel for node.
 */
|
||||
int is_openmp;
|
||||
};
|
||||
|
||||
/* Information used while building the ast.
|
||||
*/
|
||||
struct ast_build_userinfo {
|
||||
/* The current ppcg scop; its dependences drive the parallelism test. */
|
||||
struct ppcg_scop *scop;
|
||||
|
||||
/* Are we currently in a parallel for loop?
 * Set by mark_openmp_parallel, reset by ast_build_after_for.
 */
|
||||
int in_parallel_for;
|
||||
};
|
||||
|
||||
/* Check if the current scheduling dimension is parallel.
|
||||
*
|
||||
* We check for parallelism by verifying that the loop does not carry any
|
||||
* dependences.
|
||||
* If the live_range_reordering option is set, then this currently
|
||||
* includes the order dependences. In principle, non-zero order dependences
|
||||
* could be allowed, but this would require privatization and/or expansion.
|
||||
*
|
||||
* Parallelism test: if the distance is zero in all outer dimensions, then it
|
||||
* has to be zero in the current dimension as well.
|
||||
* Implementation: first, translate dependences into time space, then force
|
||||
* outer dimensions to be equal. If the distance is zero in the current
|
||||
* dimension, then the loop is parallel.
|
||||
* The distance is zero in the current dimension if it is a subset of a map
|
||||
* with equal values for the current dimension.
|
||||
*/
|
||||
static int ast_schedule_dim_is_parallel(__isl_keep isl_ast_build *build,
|
||||
struct ppcg_scop *scop)
|
||||
{
|
||||
isl_union_map *schedule, *deps;
|
||||
isl_map *schedule_deps, *test;
|
||||
isl_space *schedule_space;
|
||||
unsigned i, dimension, is_parallel;
|
||||
|
||||
schedule = isl_ast_build_get_schedule(build);
|
||||
schedule_space = isl_ast_build_get_schedule_space(build);
|
||||
|
||||
dimension = isl_space_dim(schedule_space, isl_dim_out) - 1;
|
||||
|
||||
deps = isl_union_map_copy(scop->dep_flow);
|
||||
deps = isl_union_map_union(deps, isl_union_map_copy(scop->dep_false));
|
||||
if (scop->options->live_range_reordering) {
|
||||
isl_union_map *order = isl_union_map_copy(scop->dep_order);
|
||||
deps = isl_union_map_union(deps, order);
|
||||
}
|
||||
deps = isl_union_map_apply_range(deps, isl_union_map_copy(schedule));
|
||||
deps = isl_union_map_apply_domain(deps, schedule);
|
||||
|
||||
if (isl_union_map_is_empty(deps)) {
|
||||
isl_union_map_free(deps);
|
||||
isl_space_free(schedule_space);
|
||||
return 1;
|
||||
}
|
||||
|
||||
schedule_deps = isl_map_from_union_map(deps);
|
||||
|
||||
for (i = 0; i < dimension; i++)
|
||||
schedule_deps = isl_map_equate(schedule_deps, isl_dim_out, i,
|
||||
isl_dim_in, i);
|
||||
|
||||
test = isl_map_universe(isl_map_get_space(schedule_deps));
|
||||
test = isl_map_equate(test, isl_dim_out, dimension, isl_dim_in,
|
||||
dimension);
|
||||
is_parallel = isl_map_is_subset(schedule_deps, test);
|
||||
|
||||
isl_space_free(schedule_space);
|
||||
isl_map_free(test);
|
||||
isl_map_free(schedule_deps);
|
||||
|
||||
return is_parallel;
|
||||
}
|
||||
|
||||
/* Mark a for node openmp parallel, if it is the outermost parallel for node.
|
||||
*/
|
||||
static void mark_openmp_parallel(__isl_keep isl_ast_build *build,
|
||||
struct ast_build_userinfo *build_info,
|
||||
struct ast_node_userinfo *node_info)
|
||||
{
|
||||
if (build_info->in_parallel_for)
|
||||
return;
|
||||
|
||||
if (ast_schedule_dim_is_parallel(build, build_info->scop)) {
|
||||
build_info->in_parallel_for = 1;
|
||||
node_info->is_openmp = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Allocate an ast_node_info structure and initialize it with default values.
|
||||
*/
|
||||
static struct ast_node_userinfo *allocate_ast_node_userinfo()
|
||||
{
|
||||
struct ast_node_userinfo *node_info;
|
||||
node_info = (struct ast_node_userinfo *)
|
||||
malloc(sizeof(struct ast_node_userinfo));
|
||||
node_info->is_openmp = 0;
|
||||
return node_info;
|
||||
}
|
||||
|
||||
/* Free the ast_node_userinfo structure pointed to by "ptr".
 *
 * The argument is an untyped pointer, which allows this function
 * to be passed to isl_id_set_free_user.
 */
static void free_ast_node_userinfo(void *ptr)
{
	free(ptr);
}
|
||||
|
||||
/* This method is executed before the construction of a for node. It creates
|
||||
* an isl_id that is used to annotate the subsequently generated ast for nodes.
|
||||
*
|
||||
* In this function we also run the following analyses:
|
||||
*
|
||||
* - Detection of openmp parallel loops
|
||||
*/
|
||||
static __isl_give isl_id *ast_build_before_for(
|
||||
__isl_keep isl_ast_build *build, void *user)
|
||||
{
|
||||
isl_id *id;
|
||||
struct ast_build_userinfo *build_info;
|
||||
struct ast_node_userinfo *node_info;
|
||||
|
||||
build_info = (struct ast_build_userinfo *) user;
|
||||
node_info = allocate_ast_node_userinfo();
|
||||
id = isl_id_alloc(isl_ast_build_get_ctx(build), "", node_info);
|
||||
id = isl_id_set_free_user(id, free_ast_node_userinfo);
|
||||
|
||||
mark_openmp_parallel(build, build_info, node_info);
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
/* This method is executed after the construction of a for node.
|
||||
*
|
||||
* It performs the following actions:
|
||||
*
|
||||
* - Reset the 'in_parallel_for' flag, as soon as we leave a for node,
|
||||
* that is marked as openmp parallel.
|
||||
*
|
||||
*/
|
||||
static __isl_give isl_ast_node *ast_build_after_for(
|
||||
__isl_take isl_ast_node *node, __isl_keep isl_ast_build *build,
|
||||
void *user)
|
||||
{
|
||||
isl_id *id;
|
||||
struct ast_build_userinfo *build_info;
|
||||
struct ast_node_userinfo *info;
|
||||
|
||||
id = isl_ast_node_get_annotation(node);
|
||||
info = isl_id_get_user(id);
|
||||
|
||||
if (info && info->is_openmp) {
|
||||
build_info = (struct ast_build_userinfo *) user;
|
||||
build_info->in_parallel_for = 0;
|
||||
}
|
||||
|
||||
isl_id_free(id);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Find the element in scop->stmts that has the given "id".
|
||||
*/
|
||||
static struct pet_stmt *find_stmt(struct ppcg_scop *scop, __isl_keep isl_id *id)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < scop->pet->n_stmt; ++i) {
|
||||
struct pet_stmt *stmt = scop->pet->stmts[i];
|
||||
isl_id *id_i;
|
||||
|
||||
id_i = isl_set_get_tuple_id(stmt->domain);
|
||||
isl_id_free(id_i);
|
||||
|
||||
if (id_i == id)
|
||||
return stmt;
|
||||
}
|
||||
|
||||
isl_die(isl_id_get_ctx(id), isl_error_internal,
|
||||
"statement not found", return NULL);
|
||||
}
|
||||
|
||||
/* Print a user statement in the generated AST.
|
||||
* The ppcg_stmt has been attached to the node in at_each_domain.
|
||||
*/
|
||||
static __isl_give isl_printer *print_user(__isl_take isl_printer *p,
|
||||
__isl_take isl_ast_print_options *print_options,
|
||||
__isl_keep isl_ast_node *node, void *user)
|
||||
{
|
||||
struct ppcg_stmt *stmt;
|
||||
isl_id *id;
|
||||
|
||||
id = isl_ast_node_get_annotation(node);
|
||||
stmt = isl_id_get_user(id);
|
||||
isl_id_free(id);
|
||||
|
||||
p = pet_stmt_print_body(stmt->stmt, p, stmt->ref2expr);
|
||||
|
||||
isl_ast_print_options_free(print_options);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
/* Print a for loop node as an openmp parallel loop.
|
||||
*
|
||||
* To print an openmp parallel loop we print a normal for loop, but add
|
||||
* "#pragma openmp parallel for" in front.
|
||||
*
|
||||
* Variables that are declared within the body of this for loop are
|
||||
* automatically openmp 'private'. Iterators declared outside of the
|
||||
* for loop are automatically openmp 'shared'. As ppcg declares all iterators
|
||||
* at the position where they are assigned, there is no need to explicitly mark
|
||||
* variables. Their automatically assigned type is already correct.
|
||||
*
|
||||
* This function only generates valid OpenMP code, if the ast was generated
|
||||
* with the 'atomic-bounds' option enabled.
|
||||
*
|
||||
*/
|
||||
static __isl_give isl_printer *print_for_with_openmp(
|
||||
__isl_keep isl_ast_node *node, __isl_take isl_printer *p,
|
||||
__isl_take isl_ast_print_options *print_options)
|
||||
{
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "#pragma omp parallel for");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
p = isl_ast_node_for_print(node, p, print_options);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print a for node.
|
||||
*
|
||||
* Depending on how the node is annotated, we either print a normal
|
||||
* for node or an openmp parallel for node.
|
||||
*/
|
||||
static __isl_give isl_printer *print_for(__isl_take isl_printer *p,
|
||||
__isl_take isl_ast_print_options *print_options,
|
||||
__isl_keep isl_ast_node *node, void *user)
|
||||
{
|
||||
isl_id *id;
|
||||
int openmp;
|
||||
|
||||
openmp = 0;
|
||||
id = isl_ast_node_get_annotation(node);
|
||||
|
||||
if (id) {
|
||||
struct ast_node_userinfo *info;
|
||||
|
||||
info = (struct ast_node_userinfo *) isl_id_get_user(id);
|
||||
if (info && info->is_openmp)
|
||||
openmp = 1;
|
||||
}
|
||||
|
||||
if (openmp)
|
||||
p = print_for_with_openmp(node, p, print_options);
|
||||
else
|
||||
p = isl_ast_node_for_print(node, p, print_options);
|
||||
|
||||
isl_id_free(id);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Index transformation callback for pet_stmt_build_ast_exprs.
|
||||
*
|
||||
* "index" expresses the array indices in terms of statement iterators
|
||||
* "iterator_map" expresses the statement iterators in terms of
|
||||
* AST loop iterators.
|
||||
*
|
||||
* The result expresses the array indices in terms of
|
||||
* AST loop iterators.
|
||||
*/
|
||||
static __isl_give isl_multi_pw_aff *pullback_index(
|
||||
__isl_take isl_multi_pw_aff *index, __isl_keep isl_id *id, void *user)
|
||||
{
|
||||
isl_pw_multi_aff *iterator_map = user;
|
||||
|
||||
iterator_map = isl_pw_multi_aff_copy(iterator_map);
|
||||
return isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
|
||||
}
|
||||
|
||||
/* Transform the accesses in the statement associated to the domain
|
||||
* called by "node" to refer to the AST loop iterators, construct
|
||||
* corresponding AST expressions using "build",
|
||||
* collect them in a ppcg_stmt and annotate the node with the ppcg_stmt.
|
||||
*/
|
||||
static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
|
||||
__isl_keep isl_ast_build *build, void *user)
|
||||
{
|
||||
struct ppcg_scop *scop = user;
|
||||
isl_ast_expr *expr, *arg;
|
||||
isl_ctx *ctx;
|
||||
isl_id *id;
|
||||
isl_map *map;
|
||||
isl_pw_multi_aff *iterator_map;
|
||||
struct ppcg_stmt *stmt;
|
||||
|
||||
ctx = isl_ast_node_get_ctx(node);
|
||||
stmt = isl_calloc_type(ctx, struct ppcg_stmt);
|
||||
if (!stmt)
|
||||
goto error;
|
||||
|
||||
expr = isl_ast_node_user_get_expr(node);
|
||||
arg = isl_ast_expr_get_op_arg(expr, 0);
|
||||
isl_ast_expr_free(expr);
|
||||
id = isl_ast_expr_get_id(arg);
|
||||
isl_ast_expr_free(arg);
|
||||
stmt->stmt = find_stmt(scop, id);
|
||||
isl_id_free(id);
|
||||
if (!stmt->stmt)
|
||||
goto error;
|
||||
|
||||
map = isl_map_from_union_map(isl_ast_build_get_schedule(build));
|
||||
map = isl_map_reverse(map);
|
||||
iterator_map = isl_pw_multi_aff_from_map(map);
|
||||
stmt->ref2expr = pet_stmt_build_ast_exprs(stmt->stmt, build,
|
||||
&pullback_index, iterator_map, NULL, NULL);
|
||||
isl_pw_multi_aff_free(iterator_map);
|
||||
|
||||
id = isl_id_alloc(isl_ast_node_get_ctx(node), NULL, stmt);
|
||||
id = isl_id_set_free_user(id, &ppcg_stmt_free);
|
||||
return isl_ast_node_set_annotation(node, id);
|
||||
error:
|
||||
ppcg_stmt_free(stmt);
|
||||
return isl_ast_node_free(node);
|
||||
}
|
||||
|
||||
/* Set *depth (initialized to 0 by the caller) to the maximum
|
||||
* of the schedule depths of the leaf nodes for which this function is called.
|
||||
*/
|
||||
static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user)
|
||||
{
|
||||
int *depth = user;
|
||||
int node_depth;
|
||||
|
||||
if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
|
||||
return isl_bool_true;
|
||||
node_depth = isl_schedule_node_get_schedule_depth(node);
|
||||
if (node_depth > *depth)
|
||||
*depth = node_depth;
|
||||
|
||||
return isl_bool_false;
|
||||
}
|
||||
|
||||
/* This function is called for each node in a CPU AST.
|
||||
* In case of a user node, print the macro definitions required
|
||||
* for printing the AST expressions in the annotation, if any.
|
||||
* For other nodes, return true such that descendants are also
|
||||
* visited.
|
||||
*
|
||||
* In particular, print the macro definitions needed for the substitutions
|
||||
* of the original user statements.
|
||||
*/
|
||||
static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
|
||||
{
|
||||
struct ppcg_stmt *stmt;
|
||||
isl_id *id;
|
||||
isl_printer **p = user;
|
||||
|
||||
if (isl_ast_node_get_type(node) != isl_ast_node_user)
|
||||
return isl_bool_true;
|
||||
|
||||
id = isl_ast_node_get_annotation(node);
|
||||
stmt = isl_id_get_user(id);
|
||||
isl_id_free(id);
|
||||
|
||||
if (!stmt)
|
||||
return isl_bool_error;
|
||||
|
||||
*p = ppcg_print_body_macros(*p, stmt->ref2expr);
|
||||
if (!*p)
|
||||
return isl_bool_error;
|
||||
|
||||
return isl_bool_false;
|
||||
}
|
||||
|
||||
/* Print the required macros for the CPU AST "node" to "p",
|
||||
* including those needed for the user statements inside the AST.
|
||||
*/
|
||||
static __isl_give isl_printer *cpu_print_macros(__isl_take isl_printer *p,
|
||||
__isl_keep isl_ast_node *node)
|
||||
{
|
||||
if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0)
|
||||
return isl_printer_free(p);
|
||||
p = ppcg_print_macros(p, node);
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Code generate the scop 'scop' using "schedule"
|
||||
* and print the corresponding C code to 'p'.
|
||||
*/
|
||||
static __isl_give isl_printer *print_scop(struct ppcg_scop *scop,
|
||||
__isl_take isl_schedule *schedule, __isl_take isl_printer *p,
|
||||
struct ppcg_options *options)
|
||||
{
|
||||
isl_ctx *ctx = isl_printer_get_ctx(p);
|
||||
isl_ast_build *build;
|
||||
isl_ast_print_options *print_options;
|
||||
isl_ast_node *tree;
|
||||
isl_id_list *iterators;
|
||||
struct ast_build_userinfo build_info;
|
||||
int depth;
|
||||
|
||||
depth = 0;
|
||||
if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
|
||||
&depth) < 0)
|
||||
goto error;
|
||||
|
||||
build = isl_ast_build_alloc(ctx);
|
||||
iterators = ppcg_scop_generate_names(scop, depth, "c");
|
||||
build = isl_ast_build_set_iterators(build, iterators);
|
||||
build = isl_ast_build_set_at_each_domain(build, &at_each_domain, scop);
|
||||
|
||||
if (options->openmp) {
|
||||
build_info.scop = scop;
|
||||
build_info.in_parallel_for = 0;
|
||||
|
||||
build = isl_ast_build_set_before_each_for(build,
|
||||
&ast_build_before_for,
|
||||
&build_info);
|
||||
build = isl_ast_build_set_after_each_for(build,
|
||||
&ast_build_after_for,
|
||||
&build_info);
|
||||
}
|
||||
|
||||
tree = isl_ast_build_node_from_schedule(build, schedule);
|
||||
isl_ast_build_free(build);
|
||||
|
||||
print_options = isl_ast_print_options_alloc(ctx);
|
||||
print_options = isl_ast_print_options_set_print_user(print_options,
|
||||
&print_user, NULL);
|
||||
|
||||
print_options = isl_ast_print_options_set_print_for(print_options,
|
||||
&print_for, NULL);
|
||||
|
||||
p = cpu_print_macros(p, tree);
|
||||
p = isl_ast_node_print(tree, p, print_options);
|
||||
|
||||
isl_ast_node_free(tree);
|
||||
|
||||
return p;
|
||||
error:
|
||||
isl_schedule_free(schedule);
|
||||
isl_printer_free(p);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Tile the band node "node" with tile sizes "sizes" and
|
||||
* mark all members of the resulting tile node as "atomic".
|
||||
*/
|
||||
static __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node,
|
||||
__isl_take isl_multi_val *sizes)
|
||||
{
|
||||
node = isl_schedule_node_band_tile(node, sizes);
|
||||
node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Tile "node", if it is a band node with at least 2 members.
|
||||
* The tile sizes are set from the "tile_size" option.
|
||||
*/
|
||||
static __isl_give isl_schedule_node *tile_band(
|
||||
__isl_take isl_schedule_node *node, void *user)
|
||||
{
|
||||
struct ppcg_scop *scop = user;
|
||||
int n;
|
||||
isl_space *space;
|
||||
isl_multi_val *sizes;
|
||||
|
||||
if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
|
||||
return node;
|
||||
|
||||
n = isl_schedule_node_band_n_member(node);
|
||||
if (n <= 1)
|
||||
return node;
|
||||
|
||||
space = isl_schedule_node_band_get_space(node);
|
||||
sizes = ppcg_multi_val_from_int(space, scop->options->tile_size);
|
||||
|
||||
return tile(node, sizes);
|
||||
}
|
||||
|
||||
/* Construct schedule constraints from the dependences in ps
|
||||
* for the purpose of computing a schedule for a CPU.
|
||||
*
|
||||
* The proximity constraints are set to the flow dependences.
|
||||
*
|
||||
* If live-range reordering is allowed then the conditional validity
|
||||
* constraints are set to the order dependences with the flow dependences
|
||||
* as condition. That is, a live-range (flow dependence) will be either
|
||||
* local to an iteration of a band or all adjacent order dependences
|
||||
* will be respected by the band.
|
||||
* The validity constraints are set to the union of the flow dependences
|
||||
* and the forced dependences, while the coincidence constraints
|
||||
* are set to the union of the flow dependences, the forced dependences and
|
||||
* the order dependences.
|
||||
*
|
||||
* If live-range reordering is not allowed, then both the validity
|
||||
* and the coincidence constraints are set to the union of the flow
|
||||
* dependences and the false dependences.
|
||||
*
|
||||
* Note that the coincidence constraints are only set when the "openmp"
|
||||
* options is set. Even though the way openmp pragmas are introduced
|
||||
* does not rely on the coincident property of the schedule band members,
|
||||
* the coincidence constraints do affect the way the schedule is constructed,
|
||||
* such that more schedule dimensions should be detected as parallel
|
||||
* by ast_schedule_dim_is_parallel.
|
||||
* Since the order dependences are also taken into account by
|
||||
* ast_schedule_dim_is_parallel, they are also added to
|
||||
* the coincidence constraints. If the openmp handling learns
|
||||
* how to privatize some memory, then the corresponding order
|
||||
* dependences can be removed from the coincidence constraints.
|
||||
*/
|
||||
static __isl_give isl_schedule_constraints *construct_cpu_schedule_constraints(
|
||||
struct ppcg_scop *ps)
|
||||
{
|
||||
isl_schedule_constraints *sc;
|
||||
isl_union_map *validity, *coincidence;
|
||||
|
||||
sc = isl_schedule_constraints_on_domain(isl_union_set_copy(ps->domain));
|
||||
if (ps->options->live_range_reordering) {
|
||||
sc = isl_schedule_constraints_set_conditional_validity(sc,
|
||||
isl_union_map_copy(ps->tagged_dep_flow),
|
||||
isl_union_map_copy(ps->tagged_dep_order));
|
||||
validity = isl_union_map_copy(ps->dep_flow);
|
||||
validity = isl_union_map_union(validity,
|
||||
isl_union_map_copy(ps->dep_forced));
|
||||
if (ps->options->openmp) {
|
||||
coincidence = isl_union_map_copy(validity);
|
||||
coincidence = isl_union_map_union(coincidence,
|
||||
isl_union_map_copy(ps->dep_order));
|
||||
}
|
||||
} else {
|
||||
validity = isl_union_map_copy(ps->dep_flow);
|
||||
validity = isl_union_map_union(validity,
|
||||
isl_union_map_copy(ps->dep_false));
|
||||
if (ps->options->openmp)
|
||||
coincidence = isl_union_map_copy(validity);
|
||||
}
|
||||
if (ps->options->openmp)
|
||||
sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
|
||||
sc = isl_schedule_constraints_set_validity(sc, validity);
|
||||
sc = isl_schedule_constraints_set_proximity(sc,
|
||||
isl_union_map_copy(ps->dep_flow));
|
||||
|
||||
return sc;
|
||||
}
|
||||
|
||||
/* Compute a schedule for the scop "ps".
|
||||
*
|
||||
* First derive the appropriate schedule constraints from the dependences
|
||||
* in "ps" and then compute a schedule from those schedule constraints,
|
||||
* possibly grouping statement instances based on the input schedule.
|
||||
*/
|
||||
static __isl_give isl_schedule *compute_cpu_schedule(struct ppcg_scop *ps)
|
||||
{
|
||||
isl_schedule_constraints *sc;
|
||||
isl_schedule *schedule;
|
||||
|
||||
if (!ps)
|
||||
return NULL;
|
||||
|
||||
sc = construct_cpu_schedule_constraints(ps);
|
||||
|
||||
if (ps->options->debug->dump_schedule_constraints)
|
||||
isl_schedule_constraints_dump(sc);
|
||||
schedule = ppcg_compute_schedule(sc, ps->schedule, ps->options);
|
||||
|
||||
return schedule;
|
||||
}
|
||||
|
||||
/* Compute a new schedule to the scop "ps" if the reschedule option is set.
|
||||
* Otherwise, return a copy of the original schedule.
|
||||
*/
|
||||
static __isl_give isl_schedule *optionally_compute_schedule(void *user)
|
||||
{
|
||||
struct ppcg_scop *ps = user;
|
||||
|
||||
if (!ps)
|
||||
return NULL;
|
||||
if (!ps->options->reschedule)
|
||||
return isl_schedule_copy(ps->schedule);
|
||||
return compute_cpu_schedule(ps);
|
||||
}
|
||||
|
||||
/* Compute a schedule based on the dependences in "ps" and
|
||||
* tile it if requested by the user.
|
||||
*/
|
||||
static __isl_give isl_schedule *get_schedule(struct ppcg_scop *ps,
|
||||
struct ppcg_options *options)
|
||||
{
|
||||
isl_ctx *ctx;
|
||||
isl_schedule *schedule;
|
||||
|
||||
if (!ps)
|
||||
return NULL;
|
||||
|
||||
ctx = isl_union_set_get_ctx(ps->domain);
|
||||
schedule = ppcg_get_schedule(ctx, options,
|
||||
&optionally_compute_schedule, ps);
|
||||
if (ps->options->tile)
|
||||
schedule = isl_schedule_map_schedule_node_bottom_up(schedule,
|
||||
&tile_band, ps);
|
||||
|
||||
return schedule;
|
||||
}
|
||||
|
||||
/* Generate CPU code for the scop "ps" using "schedule" and
|
||||
* print the corresponding C code to "p", including variable declarations.
|
||||
*/
|
||||
static __isl_give isl_printer *print_cpu_with_schedule(
|
||||
__isl_take isl_printer *p, struct ppcg_scop *ps,
|
||||
__isl_take isl_schedule *schedule, struct ppcg_options *options)
|
||||
{
|
||||
int hidden;
|
||||
isl_set *context;
|
||||
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "/* ppcg generated CPU code */");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
p = ppcg_set_macro_names(p);
|
||||
p = ppcg_print_exposed_declarations(p, ps);
|
||||
hidden = ppcg_scop_any_hidden_declarations(ps);
|
||||
if (hidden) {
|
||||
p = ppcg_start_block(p);
|
||||
p = ppcg_print_hidden_declarations(p, ps);
|
||||
}
|
||||
|
||||
context = isl_set_copy(ps->context);
|
||||
context = isl_set_from_params(context);
|
||||
schedule = isl_schedule_insert_context(schedule, context);
|
||||
if (options->debug->dump_final_schedule)
|
||||
isl_schedule_dump(schedule);
|
||||
p = print_scop(ps, schedule, p, options);
|
||||
if (hidden)
|
||||
p = ppcg_end_block(p);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Generate CPU code for the scop "ps" and print the corresponding C code
|
||||
* to "p", including variable declarations.
|
||||
*/
|
||||
__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
|
||||
struct ppcg_scop *ps, struct ppcg_options *options)
|
||||
{
|
||||
isl_schedule *schedule;
|
||||
|
||||
schedule = isl_schedule_copy(ps->schedule);
|
||||
return print_cpu_with_schedule(p, ps, schedule, options);
|
||||
}
|
||||
|
||||
/* Generate CPU code for "scop" and print it to "p".
|
||||
*
|
||||
* First obtain a schedule for "scop" and then print code for "scop"
|
||||
* using that schedule.
|
||||
*/
|
||||
static __isl_give isl_printer *generate(__isl_take isl_printer *p,
|
||||
struct ppcg_scop *scop, struct ppcg_options *options)
|
||||
{
|
||||
isl_schedule *schedule;
|
||||
|
||||
schedule = get_schedule(scop, options);
|
||||
|
||||
return print_cpu_with_schedule(p, scop, schedule, options);
|
||||
}
|
||||
|
||||
/* Wrapper around generate for use as a ppcg_transform callback.
|
||||
*/
|
||||
static __isl_give isl_printer *print_cpu_wrap(__isl_take isl_printer *p,
|
||||
struct ppcg_scop *scop, void *user)
|
||||
{
|
||||
struct ppcg_options *options = user;
|
||||
|
||||
return generate(p, scop, options);
|
||||
}
|
||||
|
||||
/* Transform the code in the file called "input" by replacing
|
||||
* all scops by corresponding CPU code and write the results to a file
|
||||
* called "output".
|
||||
*/
|
||||
int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
|
||||
const char *input, const char *output)
|
||||
{
|
||||
FILE *output_file;
|
||||
int r;
|
||||
|
||||
output_file = get_output_file(input, output);
|
||||
if (!output_file)
|
||||
return -1;
|
||||
|
||||
r = ppcg_transform(ctx, input, output_file, options,
|
||||
&print_cpu_wrap, options);
|
||||
|
||||
fclose(output_file);
|
||||
|
||||
return r;
|
||||
}
|
||||
15
polly/lib/External/ppcg/cpu.h
vendored
15
polly/lib/External/ppcg/cpu.h
vendored
@ -1,15 +0,0 @@
|
||||
#ifndef _CPU_H
#define _CPU_H

#include <isl/ctx.h>

#include "ppcg.h"

struct ppcg_options;

/* Print CPU code for the scop "ps" to "p", including variable
 * declarations, and return the updated printer.
 */
__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
	struct ppcg_scop *ps, struct ppcg_options *options);

/* Replace all scops in the file called "input" by CPU code and
 * write the result to a file called "output".
 */
int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
	const char *input, const char *output);

#endif
|
||||
730
polly/lib/External/ppcg/cuda.c
vendored
730
polly/lib/External/ppcg/cuda.c
vendored
@ -1,730 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012 Ecole Normale Superieure
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege,
|
||||
* Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
|
||||
*/
|
||||
|
||||
#include <isl/aff.h>
|
||||
#include <isl/ast.h>
|
||||
|
||||
#include "cuda_common.h"
|
||||
#include "cuda.h"
|
||||
#include "gpu.h"
|
||||
#include "gpu_print.h"
|
||||
#include "print.h"
|
||||
#include "util.h"
|
||||
|
||||
/* Print to "p" the definitions of the cudaCheckReturn and
 * cudaCheckKernel helper macros that the generated host code calls.
 */
static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
{
	static const char macro_text[] =
		"#define cudaCheckReturn(ret) \\\n"
		" do { \\\n"
		" cudaError_t cudaCheckReturn_e = (ret); \\\n"
		" if (cudaCheckReturn_e != cudaSuccess) { \\\n"
		" fprintf(stderr, \"CUDA error: %s\\n\", "
		"cudaGetErrorString(cudaCheckReturn_e)); \\\n"
		" fflush(stderr); \\\n"
		" } \\\n"
		" assert(cudaCheckReturn_e == cudaSuccess); \\\n"
		" } while(0)\n"
		"#define cudaCheckKernel() \\\n"
		" do { \\\n"
		" cudaCheckReturn(cudaGetLastError()); \\\n"
		" } while(0)\n\n";

	return isl_printer_print_str(p, macro_text);
}
|
||||
|
||||
/* Print a declaration for the device array corresponding to "array" on "p".
|
||||
*/
|
||||
static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
|
||||
struct gpu_array_info *array)
|
||||
{
|
||||
int i;
|
||||
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, array->type);
|
||||
p = isl_printer_print_str(p, " ");
|
||||
if (!array->linearize && array->n_index > 1)
|
||||
p = isl_printer_print_str(p, "(");
|
||||
p = isl_printer_print_str(p, "*dev_");
|
||||
p = isl_printer_print_str(p, array->name);
|
||||
if (!array->linearize && array->n_index > 1) {
|
||||
p = isl_printer_print_str(p, ")");
|
||||
for (i = 1; i < array->n_index; i++) {
|
||||
isl_ast_expr *bound;
|
||||
bound = isl_ast_expr_get_op_arg(array->bound_expr,
|
||||
1 + i);
|
||||
p = isl_printer_print_str(p, "[");
|
||||
p = isl_printer_print_ast_expr(p, bound);
|
||||
p = isl_printer_print_str(p, "]");
|
||||
isl_ast_expr_free(bound);
|
||||
}
|
||||
}
|
||||
p = isl_printer_print_str(p, ";");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print to "p" a device pointer declaration for each array of "prog"
 * that requires device allocation, followed by an empty line.
 */
static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	int pos;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *array = &prog->array[pos];

		if (gpu_array_requires_device_allocation(array))
			p = declare_device_array(p, array);
	}

	p = isl_printer_start_line(p);

	return isl_printer_end_line(p);
}
|
||||
|
||||
/* Print to "p" a checked cudaMalloc call for each array of "prog"
 * that requires device allocation, preceded by the macro definitions
 * needed by its bound expressions.  An empty line is printed at the end.
 */
static __isl_give isl_printer *allocate_device_arrays(
	__isl_take isl_printer *p, struct gpu_prog *prog)
{
	int pos;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *array = &prog->array[pos];

		if (!gpu_array_requires_device_allocation(array))
			continue;

		p = ppcg_ast_expr_print_macros(array->bound_expr, p);
		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p,
			"cudaCheckReturn(cudaMalloc((void **) &dev_");
		p = isl_printer_print_str(p, array->name);
		p = isl_printer_print_str(p, ", ");
		p = gpu_array_info_print_size(p, array);
		p = isl_printer_print_str(p, "));");
		p = isl_printer_end_line(p);
	}

	p = isl_printer_start_line(p);

	return isl_printer_end_line(p);
}
|
||||
|
||||
/* Print to "p" a checked cudaFree call for each array of "prog"
 * that requires device allocation.
 */
static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	int pos;

	for (pos = 0; pos < prog->n_array; ++pos) {
		struct gpu_array_info *array = &prog->array[pos];

		if (!gpu_array_requires_device_allocation(array))
			continue;

		p = isl_printer_start_line(p);
		p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
		p = isl_printer_print_str(p, array->name);
		p = isl_printer_print_str(p, "));");
		p = isl_printer_end_line(p);
	}

	return p;
}
|
||||
|
||||
/* Print code to "p" for copying "array" from the host to the device
|
||||
* in its entirety. The bounds on the extent of "array" have
|
||||
* been precomputed in extract_array_info and are used in
|
||||
* gpu_array_info_print_size.
|
||||
*/
|
||||
static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
|
||||
struct gpu_array_info *array)
|
||||
{
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
|
||||
p = isl_printer_print_str(p, array->name);
|
||||
p = isl_printer_print_str(p, ", ");
|
||||
|
||||
if (gpu_array_is_scalar(array))
|
||||
p = isl_printer_print_str(p, "&");
|
||||
p = isl_printer_print_str(p, array->name);
|
||||
p = isl_printer_print_str(p, ", ");
|
||||
|
||||
p = gpu_array_info_print_size(p, array);
|
||||
p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print code to "p" for copying "array" back from the device to the host
|
||||
* in its entirety. The bounds on the extent of "array" have
|
||||
* been precomputed in extract_array_info and are used in
|
||||
* gpu_array_info_print_size.
|
||||
*/
|
||||
static __isl_give isl_printer *copy_array_from_device(
|
||||
__isl_take isl_printer *p, struct gpu_array_info *array)
|
||||
{
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
|
||||
if (gpu_array_is_scalar(array))
|
||||
p = isl_printer_print_str(p, "&");
|
||||
p = isl_printer_print_str(p, array->name);
|
||||
p = isl_printer_print_str(p, ", dev_");
|
||||
p = isl_printer_print_str(p, array->name);
|
||||
p = isl_printer_print_str(p, ", ");
|
||||
p = gpu_array_info_print_size(p, array);
|
||||
p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the "len" integers in "list" to "p" in reverse order,
 * separated by ", " and enclosed in parentheses.
 * Nothing is printed if "len" is zero.
 */
static __isl_give isl_printer* print_reverse_list(__isl_take isl_printer *p, int len, int *list)
{
	int pos;

	if (len == 0)
		return p;

	p = isl_printer_print_str(p, "(");
	for (pos = len - 1; pos >= 0; --pos) {
		/* Every element but the first printed one gets a separator. */
		if (pos != len - 1)
			p = isl_printer_print_str(p, ", ");
		p = isl_printer_print_int(p, list[pos]);
	}

	return isl_printer_print_str(p, ")");
}
|
||||
|
||||
/* Print the effective grid size as a list of the sizes in each
|
||||
* dimension, from innermost to outermost.
|
||||
*/
|
||||
static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
|
||||
struct ppcg_kernel *kernel)
|
||||
{
|
||||
int i;
|
||||
int dim;
|
||||
|
||||
dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
|
||||
if (dim == 0)
|
||||
return p;
|
||||
|
||||
p = isl_printer_print_str(p, "(");
|
||||
for (i = dim - 1; i >= 0; --i) {
|
||||
isl_ast_expr *bound;
|
||||
|
||||
bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i);
|
||||
p = isl_printer_print_ast_expr(p, bound);
|
||||
isl_ast_expr_free(bound);
|
||||
|
||||
if (i > 0)
|
||||
p = isl_printer_print_str(p, ", ");
|
||||
}
|
||||
|
||||
p = isl_printer_print_str(p, ")");
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the grid definition.
|
||||
*/
|
||||
static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
|
||||
struct ppcg_kernel *kernel)
|
||||
{
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "dim3 k");
|
||||
p = isl_printer_print_int(p, kernel->id);
|
||||
p = isl_printer_print_str(p, "_dimGrid");
|
||||
p = print_grid_size(p, kernel);
|
||||
p = isl_printer_print_str(p, ";");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the arguments to a kernel declaration or call. If "types" is set,
|
||||
* then print a declaration (including the types of the arguments).
|
||||
*
|
||||
* The arguments are printed in the following order
|
||||
* - the arrays accessed by the kernel
|
||||
* - the parameters
|
||||
* - the host loop iterators
|
||||
*/
|
||||
static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
|
||||
struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
|
||||
{
|
||||
int i, n;
|
||||
int first = 1;
|
||||
unsigned nparam;
|
||||
isl_space *space;
|
||||
const char *type;
|
||||
|
||||
for (i = 0; i < prog->n_array; ++i) {
|
||||
int required;
|
||||
|
||||
required = ppcg_kernel_requires_array_argument(kernel, i);
|
||||
if (required < 0)
|
||||
return isl_printer_free(p);
|
||||
if (!required)
|
||||
continue;
|
||||
|
||||
if (!first)
|
||||
p = isl_printer_print_str(p, ", ");
|
||||
|
||||
if (types)
|
||||
p = gpu_array_info_print_declaration_argument(p,
|
||||
&prog->array[i], NULL);
|
||||
else
|
||||
p = gpu_array_info_print_call_argument(p,
|
||||
&prog->array[i]);
|
||||
|
||||
first = 0;
|
||||
}
|
||||
|
||||
space = isl_union_set_get_space(kernel->arrays);
|
||||
nparam = isl_space_dim(space, isl_dim_param);
|
||||
for (i = 0; i < nparam; ++i) {
|
||||
const char *name;
|
||||
|
||||
name = isl_space_get_dim_name(space, isl_dim_param, i);
|
||||
|
||||
if (!first)
|
||||
p = isl_printer_print_str(p, ", ");
|
||||
if (types)
|
||||
p = isl_printer_print_str(p, "int ");
|
||||
p = isl_printer_print_str(p, name);
|
||||
|
||||
first = 0;
|
||||
}
|
||||
isl_space_free(space);
|
||||
|
||||
n = isl_space_dim(kernel->space, isl_dim_set);
|
||||
type = isl_options_get_ast_iterator_type(prog->ctx);
|
||||
for (i = 0; i < n; ++i) {
|
||||
const char *name;
|
||||
|
||||
if (!first)
|
||||
p = isl_printer_print_str(p, ", ");
|
||||
name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
|
||||
if (types) {
|
||||
p = isl_printer_print_str(p, type);
|
||||
p = isl_printer_print_str(p, " ");
|
||||
}
|
||||
p = isl_printer_print_str(p, name);
|
||||
|
||||
first = 0;
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the header of the given kernel.
|
||||
*/
|
||||
static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
|
||||
struct gpu_prog *prog, struct ppcg_kernel *kernel)
|
||||
{
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "__global__ void kernel");
|
||||
p = isl_printer_print_int(p, kernel->id);
|
||||
p = isl_printer_print_str(p, "(");
|
||||
p = print_kernel_arguments(p, prog, kernel, 1);
|
||||
p = isl_printer_print_str(p, ")");
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the header of the given kernel to both gen->cuda.kernel_h
|
||||
* and gen->cuda.kernel_c.
|
||||
*/
|
||||
static void print_kernel_headers(struct gpu_prog *prog,
|
||||
struct ppcg_kernel *kernel, struct cuda_info *cuda)
|
||||
{
|
||||
isl_printer *p;
|
||||
|
||||
p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
|
||||
p = isl_printer_set_output_format(p, ISL_FORMAT_C);
|
||||
p = print_kernel_header(p, prog, kernel);
|
||||
p = isl_printer_print_str(p, ";");
|
||||
p = isl_printer_end_line(p);
|
||||
isl_printer_free(p);
|
||||
|
||||
p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
|
||||
p = isl_printer_set_output_format(p, ISL_FORMAT_C);
|
||||
p = print_kernel_header(p, prog, kernel);
|
||||
p = isl_printer_end_line(p);
|
||||
isl_printer_free(p);
|
||||
}
|
||||
|
||||
/* Write an empty string padded to a field width of "indent" to "dst",
 * i.e., "indent" space characters for a positive "indent".
 */
static void print_indent(FILE *dst, int indent)
{
	const char *nothing = "";

	fprintf(dst, "%*s", indent, nothing);
}
|
||||
|
||||
/* Print a list of iterators of type "type" with names "ids" to "out".
|
||||
* Each iterator is assigned one of the cuda identifiers in cuda_dims.
|
||||
* In particular, the last iterator is assigned the x identifier
|
||||
* (the first in the list of cuda identifiers).
|
||||
*/
|
||||
static void print_iterators(FILE *out, const char *type,
|
||||
__isl_keep isl_id_list *ids, const char *cuda_dims[])
|
||||
{
|
||||
int i, n;
|
||||
|
||||
n = isl_id_list_n_id(ids);
|
||||
if (n <= 0)
|
||||
return;
|
||||
print_indent(out, 4);
|
||||
fprintf(out, "%s ", type);
|
||||
for (i = 0; i < n; ++i) {
|
||||
isl_id *id;
|
||||
|
||||
if (i)
|
||||
fprintf(out, ", ");
|
||||
id = isl_id_list_get_id(ids, i);
|
||||
fprintf(out, "%s = %s", isl_id_get_name(id),
|
||||
cuda_dims[n - 1 - i]);
|
||||
isl_id_free(id);
|
||||
}
|
||||
fprintf(out, ";\n");
|
||||
}
|
||||
|
||||
static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
|
||||
{
|
||||
isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
|
||||
const char *type;
|
||||
const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
|
||||
const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
|
||||
"threadIdx.z" };
|
||||
|
||||
type = isl_options_get_ast_iterator_type(ctx);
|
||||
|
||||
print_iterators(out, type, kernel->block_ids, block_dims);
|
||||
print_iterators(out, type, kernel->thread_ids, thread_dims);
|
||||
}
|
||||
|
||||
/* Print the declaration of the kernel-local variable "var" on its own line:
 * an optional "__shared__ " qualifier (for ppcg_access_shared variables),
 * the element type of the underlying array, the variable name and
 * one "[size]" suffix per index of the array, taken from var->size.
 */
static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
	struct ppcg_kernel_var *var)
{
	int dim;
	int is_shared = var->type == ppcg_access_shared;

	p = isl_printer_start_line(p);
	if (is_shared)
		p = isl_printer_print_str(p, "__shared__ ");
	p = isl_printer_print_str(p, var->array->type);
	p = isl_printer_print_str(p, " ");
	p = isl_printer_print_str(p, var->name);

	for (dim = 0; dim < var->array->n_index; ++dim) {
		isl_val *extent = isl_vec_get_element_val(var->size, dim);

		p = isl_printer_print_str(p, "[");
		p = isl_printer_print_val(p, extent);
		p = isl_printer_print_str(p, "]");
		isl_val_free(extent);
	}

	p = isl_printer_print_str(p, ";");

	return isl_printer_end_line(p);
}
|
||||
|
||||
/* Print the declarations of all kernel->n_var local variables of "kernel",
 * in the order in which they are stored in kernel->var.
 */
static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
	struct ppcg_kernel *kernel)
{
	int idx = 0;

	while (idx < kernel->n_var) {
		p = print_kernel_var(p, kernel->var + idx);
		++idx;
	}

	return p;
}
|
||||
|
||||
/* Print a sync statement.
|
||||
*/
|
||||
static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
|
||||
struct ppcg_kernel_stmt *stmt)
|
||||
{
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "__syncthreads();");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* This function is called for each user statement in the AST,
|
||||
* i.e., for each kernel body statement, copy statement or sync statement.
|
||||
*/
|
||||
static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
|
||||
__isl_take isl_ast_print_options *print_options,
|
||||
__isl_keep isl_ast_node *node, void *user)
|
||||
{
|
||||
isl_id *id;
|
||||
struct ppcg_kernel_stmt *stmt;
|
||||
|
||||
id = isl_ast_node_get_annotation(node);
|
||||
stmt = isl_id_get_user(id);
|
||||
isl_id_free(id);
|
||||
|
||||
isl_ast_print_options_free(print_options);
|
||||
|
||||
switch (stmt->type) {
|
||||
case ppcg_kernel_copy:
|
||||
return ppcg_kernel_print_copy(p, stmt);
|
||||
case ppcg_kernel_sync:
|
||||
return print_sync(p, stmt);
|
||||
case ppcg_kernel_domain:
|
||||
return ppcg_kernel_print_domain(p, stmt);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
|
||||
struct cuda_info *cuda)
|
||||
{
|
||||
isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
|
||||
isl_ast_print_options *print_options;
|
||||
isl_printer *p;
|
||||
|
||||
print_kernel_headers(prog, kernel, cuda);
|
||||
fprintf(cuda->kernel_c, "{\n");
|
||||
print_kernel_iterators(cuda->kernel_c, kernel);
|
||||
|
||||
p = isl_printer_to_file(ctx, cuda->kernel_c);
|
||||
p = isl_printer_set_output_format(p, ISL_FORMAT_C);
|
||||
p = isl_printer_indent(p, 4);
|
||||
|
||||
p = print_kernel_vars(p, kernel);
|
||||
p = isl_printer_end_line(p);
|
||||
p = ppcg_set_macro_names(p);
|
||||
p = gpu_print_macros(p, kernel->tree);
|
||||
|
||||
print_options = isl_ast_print_options_alloc(ctx);
|
||||
print_options = isl_ast_print_options_set_print_user(print_options,
|
||||
&print_kernel_stmt, NULL);
|
||||
p = isl_ast_node_print(kernel->tree, p, print_options);
|
||||
isl_printer_free(p);
|
||||
|
||||
fprintf(cuda->kernel_c, "}\n");
|
||||
}
|
||||
|
||||
/* Print code for initializing the device for execution of the transformed
|
||||
* code. This includes declaring locally defined variables as well as
|
||||
* declaring and allocating the required copies of arrays on the device.
|
||||
*/
|
||||
static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
|
||||
struct gpu_prog *prog)
|
||||
{
|
||||
p = print_cuda_macros(p);
|
||||
|
||||
p = gpu_print_local_declarations(p, prog);
|
||||
p = declare_device_arrays(p, prog);
|
||||
p = allocate_device_arrays(p, prog);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print code for clearing the device after execution of the transformed code.
|
||||
* In particular, free the memory that was allocated on the device.
|
||||
*/
|
||||
static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
|
||||
struct gpu_prog *prog)
|
||||
{
|
||||
p = free_device_arrays(p, prog);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print a statement for copying an array to or from the device,
|
||||
* or for initializing or clearing the device.
|
||||
* The statement identifier of a copying node is called
|
||||
* "to_device_<array name>" or "from_device_<array name>" and
|
||||
* its user pointer points to the gpu_array_info of the array
|
||||
* that needs to be copied.
|
||||
* The node for initializing the device is called "init_device".
|
||||
* The node for clearing the device is called "clear_device".
|
||||
*
|
||||
* Extract the array (if any) from the identifier and call
|
||||
* init_device, clear_device, copy_array_to_device or copy_array_from_device.
|
||||
*/
|
||||
static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
|
||||
__isl_keep isl_ast_node *node, struct gpu_prog *prog)
|
||||
{
|
||||
isl_ast_expr *expr, *arg;
|
||||
isl_id *id;
|
||||
const char *name;
|
||||
struct gpu_array_info *array;
|
||||
|
||||
expr = isl_ast_node_user_get_expr(node);
|
||||
arg = isl_ast_expr_get_op_arg(expr, 0);
|
||||
id = isl_ast_expr_get_id(arg);
|
||||
name = isl_id_get_name(id);
|
||||
array = isl_id_get_user(id);
|
||||
isl_id_free(id);
|
||||
isl_ast_expr_free(arg);
|
||||
isl_ast_expr_free(expr);
|
||||
|
||||
if (!name)
|
||||
return isl_printer_free(p);
|
||||
if (!strcmp(name, "init_device"))
|
||||
return init_device(p, prog);
|
||||
if (!strcmp(name, "clear_device"))
|
||||
return clear_device(p, prog);
|
||||
if (!array)
|
||||
return isl_printer_free(p);
|
||||
|
||||
if (!prefixcmp(name, "to_device"))
|
||||
return copy_array_to_device(p, array);
|
||||
else
|
||||
return copy_array_from_device(p, array);
|
||||
}
|
||||
|
||||
/* User data handed to print_host_user through the AST print options.
 * The order of the fields matters: print_host_code initializes
 * this structure positionally as { cuda, prog }.
 */
struct print_host_user_data {
	/* The output files. */
	struct cuda_info *cuda;
	/* The program for which host code is being printed. */
	struct gpu_prog *prog;
};
|
||||
|
||||
/* Print the user statement of the host code to "p".
|
||||
*
|
||||
* The host code may contain original user statements, kernel launches,
|
||||
* statements that copy data to/from the device and statements
|
||||
* the initialize or clear the device.
|
||||
* The original user statements and the kernel launches have
|
||||
* an associated annotation, while the other statements do not.
|
||||
* The latter are handled by print_device_node.
|
||||
* The annotation on the user statements is called "user".
|
||||
*
|
||||
* In case of a kernel launch, print a block of statements that
|
||||
* defines the grid and the block and then launches the kernel.
|
||||
*/
|
||||
__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
|
||||
__isl_take isl_ast_print_options *print_options,
|
||||
__isl_keep isl_ast_node *node, void *user)
|
||||
{
|
||||
isl_id *id;
|
||||
int is_user;
|
||||
struct ppcg_kernel *kernel;
|
||||
struct ppcg_kernel_stmt *stmt;
|
||||
struct print_host_user_data *data;
|
||||
|
||||
isl_ast_print_options_free(print_options);
|
||||
|
||||
data = (struct print_host_user_data *) user;
|
||||
|
||||
id = isl_ast_node_get_annotation(node);
|
||||
if (!id)
|
||||
return print_device_node(p, node, data->prog);
|
||||
|
||||
is_user = !strcmp(isl_id_get_name(id), "user");
|
||||
kernel = is_user ? NULL : isl_id_get_user(id);
|
||||
stmt = is_user ? isl_id_get_user(id) : NULL;
|
||||
isl_id_free(id);
|
||||
|
||||
if (is_user)
|
||||
return ppcg_kernel_print_domain(p, stmt);
|
||||
|
||||
p = ppcg_start_block(p);
|
||||
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "dim3 k");
|
||||
p = isl_printer_print_int(p, kernel->id);
|
||||
p = isl_printer_print_str(p, "_dimBlock");
|
||||
p = print_reverse_list(p, kernel->n_block, kernel->block_dim);
|
||||
p = isl_printer_print_str(p, ";");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
p = print_grid(p, kernel);
|
||||
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "kernel");
|
||||
p = isl_printer_print_int(p, kernel->id);
|
||||
p = isl_printer_print_str(p, " <<<k");
|
||||
p = isl_printer_print_int(p, kernel->id);
|
||||
p = isl_printer_print_str(p, "_dimGrid, k");
|
||||
p = isl_printer_print_int(p, kernel->id);
|
||||
p = isl_printer_print_str(p, "_dimBlock>>> (");
|
||||
p = print_kernel_arguments(p, data->prog, kernel, 0);
|
||||
p = isl_printer_print_str(p, ");");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "cudaCheckKernel();");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
p = ppcg_end_block(p);
|
||||
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
#if 0
|
||||
print_kernel(data->prog, kernel, data->cuda);
|
||||
#endif
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the host code corresponding to "tree" to "p":
 * first the macro definitions needed by "tree", then the AST itself,
 * with user statements printed by print_host_user.
 * "cuda" and "prog" are passed on to print_host_user.
 */
static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
	struct cuda_info *cuda)
{
	struct print_host_user_data data = { cuda, prog };
	isl_ctx *ctx = isl_ast_node_get_ctx(tree);
	isl_ast_print_options *options;

	options = isl_ast_print_options_alloc(ctx);
	options = isl_ast_print_options_set_print_user(options,
						&print_host_user, &data);

	p = gpu_print_macros(p, tree);

	return isl_ast_node_print(tree, p, options);
}
|
||||
|
||||
/* Given a gpu_prog "prog" and the corresponding transformed AST
|
||||
* "tree", print the entire CUDA code to "p".
|
||||
* "types" collects the types for which a definition has already
|
||||
* been printed.
|
||||
*/
|
||||
static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
|
||||
struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
|
||||
struct gpu_types *types, void *user)
|
||||
{
|
||||
struct cuda_info *cuda = user;
|
||||
isl_printer *kernel;
|
||||
|
||||
kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
|
||||
kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
|
||||
kernel = gpu_print_types(kernel, types, prog);
|
||||
isl_printer_free(kernel);
|
||||
|
||||
if (!kernel)
|
||||
return isl_printer_free(p);
|
||||
|
||||
p = print_host_code(p, prog, tree, cuda);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Transform the code in the file called "input" by replacing
|
||||
* all scops by corresponding CUDA code.
|
||||
* The names of the output files are derived from "input".
|
||||
*
|
||||
* We let generate_gpu do all the hard work and then let it call
|
||||
* us back for printing the AST in print_cuda.
|
||||
*
|
||||
* To prepare for this printing, we first open the output files
|
||||
* and we close them after generate_gpu has finished.
|
||||
*/
|
||||
int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
|
||||
const char *input)
|
||||
{
|
||||
struct cuda_info cuda;
|
||||
int r;
|
||||
|
||||
cuda_open_files(&cuda, input);
|
||||
|
||||
r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
|
||||
|
||||
cuda_close_files(&cuda);
|
||||
|
||||
return r;
|
||||
}
|
||||
13
polly/lib/External/ppcg/cuda.h
vendored
13
polly/lib/External/ppcg/cuda.h
vendored
@ -1,13 +0,0 @@
|
||||
#ifndef _CUDA_H
#define _CUDA_H

#include "ppcg_options.h"
#include "ppcg.h"

/* Transform the code in the file called "input" by replacing
 * all scops by corresponding CUDA code.
 */
int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
	const char *input);

/* Print the user statement "node" of the host code to "p".
 * The signature is that of an isl_ast_print_options print_user callback.
 */
__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
	__isl_take isl_ast_print_options *print_options,
	__isl_keep isl_ast_node *node, void *user);

#endif
|
||||
50
polly/lib/External/ppcg/cuda_common.c
vendored
50
polly/lib/External/ppcg/cuda_common.c
vendored
@ -1,50 +0,0 @@
|
||||
/*
|
||||
* Copyright 2010 INRIA Saclay
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
|
||||
* Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
|
||||
* 91893 Orsay, France
|
||||
*/
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "cuda_common.h"
|
||||
#include "ppcg.h"
|
||||
|
||||
/* Open the host .cu file and the kernel .hu and .cu files for writing.
|
||||
* Add the necessary includes.
|
||||
*/
|
||||
void cuda_open_files(struct cuda_info *info, const char *input)
|
||||
{
|
||||
char name[PATH_MAX];
|
||||
int len;
|
||||
|
||||
len = ppcg_extract_base_name(name, input);
|
||||
|
||||
strcpy(name + len, "_host.cu");
|
||||
info->host_c = fopen(name, "w");
|
||||
|
||||
strcpy(name + len, "_kernel.cu");
|
||||
info->kernel_c = fopen(name, "w");
|
||||
|
||||
strcpy(name + len, "_kernel.hu");
|
||||
info->kernel_h = fopen(name, "w");
|
||||
fprintf(info->host_c, "#include <assert.h>\n");
|
||||
fprintf(info->host_c, "#include <stdio.h>\n");
|
||||
fprintf(info->host_c, "#include \"%s\"\n", name);
|
||||
fprintf(info->kernel_c, "#include \"%s\"\n", name);
|
||||
fprintf(info->kernel_h, "#include \"cuda.h\"\n\n");
|
||||
}
|
||||
|
||||
/* Close all output files.
|
||||
*/
|
||||
void cuda_close_files(struct cuda_info *info)
|
||||
{
|
||||
fclose(info->kernel_c);
|
||||
fclose(info->kernel_h);
|
||||
fclose(info->host_c);
|
||||
}
|
||||
15
polly/lib/External/ppcg/cuda_common.h
vendored
15
polly/lib/External/ppcg/cuda_common.h
vendored
@ -1,15 +0,0 @@
|
||||
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_

#include <stdio.h>

/* The three output streams used while generating CUDA code. */
struct cuda_info {
	/* Host source file. */
	FILE *host_c;
	/* Kernel source file. */
	FILE *kernel_c;
	/* Kernel header file. */
	FILE *kernel_h;
};

/* Open the output files derived from "input" and store them in "info". */
void cuda_open_files(struct cuda_info *info, const char *input);
/* Close the output files stored in "info". */
void cuda_close_files(struct cuda_info *info);

#endif
|
||||
192
polly/lib/External/ppcg/external.c
vendored
192
polly/lib/External/ppcg/external.c
vendored
@ -1,192 +0,0 @@
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <pet.h>
|
||||
#include "cpu.h"
|
||||
#include "opencl.h"
|
||||
|
||||
|
||||
/* Report on stderr that the enclosing placeholder function was called
 * and abort the process.
 *
 * The body is wrapped in do { ... } while (0) so that "die();" expands
 * to exactly one statement and can safely be used as the unbraced branch
 * of an if/else.
 */
#define die() do { \
	fprintf(stderr, "Dummy function %s called\n", __FUNCTION__); \
	abort(); \
} while (0)
|
||||
|
||||
__isl_give isl_union_map *pet_scop_compute_outer_to_any(
|
||||
__isl_keep pet_scop *scop) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_union_map *pet_scop_compute_outer_to_inner(
|
||||
__isl_keep pet_scop *scop) {
|
||||
die();
|
||||
}
|
||||
enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree) {
|
||||
die();
|
||||
}
|
||||
int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree,
|
||||
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
|
||||
die();
|
||||
}
|
||||
isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_union_map *pet_expr_access_get_tagged_may_read(
|
||||
__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_union_map *pet_expr_access_get_tagged_may_write(
|
||||
__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_union_map *pet_expr_access_get_must_write(
|
||||
__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_multi_pw_aff *pet_expr_access_get_index(
|
||||
__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
|
||||
struct ppcg_scop *ps, struct ppcg_options *options) {
|
||||
die();
|
||||
}
|
||||
|
||||
__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt,
|
||||
__isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr) {
|
||||
die();
|
||||
}
|
||||
unsigned pet_loc_get_start(__isl_keep pet_loc *loc) {
|
||||
die();
|
||||
}
|
||||
unsigned pet_loc_get_end(__isl_keep pet_loc *loc) {
|
||||
die();
|
||||
}
|
||||
int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
|
||||
__isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
|
||||
__isl_take pet_scop *scop, void *user), void *user) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
|
||||
__isl_take isl_printer *p) {
|
||||
die();
|
||||
}
|
||||
__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop) {
|
||||
die();
|
||||
}
|
||||
__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop) {
|
||||
die();
|
||||
}
|
||||
int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop) {
|
||||
die();
|
||||
}
|
||||
int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop) {
|
||||
die();
|
||||
}
|
||||
int pet_tree_foreach_expr(__isl_keep pet_tree *tree,
|
||||
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
|
||||
die();
|
||||
}
|
||||
int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr,
|
||||
int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
|
||||
die();
|
||||
}
|
||||
int pet_stmt_is_kill(struct pet_stmt *stmt) {
|
||||
die();
|
||||
}
|
||||
struct isl_args pet_options_args;
|
||||
const char *ppcg_version(void) {
|
||||
die();
|
||||
}
|
||||
int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val) {
|
||||
die();
|
||||
}
|
||||
int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
|
||||
const char *input, const char *output) {
|
||||
die();
|
||||
}
|
||||
int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
|
||||
const char *input, const char *output) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt,
|
||||
__isl_keep isl_ast_build *build,
|
||||
__isl_give isl_multi_pw_aff *(*fn_index)(
|
||||
__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id,
|
||||
void *user), void *user_index,
|
||||
__isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr,
|
||||
__isl_keep isl_id *id, void *user), void *user_expr) {
|
||||
die();
|
||||
}
|
||||
/* Placeholder definitions of the pet scop access relation getters.
 * Each one reports its own name on stderr and aborts through die().
 */

__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
	__isl_keep pet_scop *scop)
{
	die();
}

__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop)
{
	die();
}

__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop)
{
	die();
}

__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop)
{
	die();
}

__isl_give isl_union_map *pet_scop_get_tagged_may_writes(
	__isl_keep pet_scop *scop)
{
	die();
}

__isl_give isl_union_map *pet_scop_get_tagged_must_writes(
	__isl_keep pet_scop *scop)
{
	die();
}

__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop)
{
	die();
}

__isl_give isl_union_map *pet_scop_get_tagged_must_kills(
	__isl_keep pet_scop *scop)
{
	die();
}
|
||||
__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr,
|
||||
__isl_keep const char *name) {
|
||||
die();
|
||||
}
|
||||
__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos) {
|
||||
die();
|
||||
}
|
||||
__isl_give pet_expr *pet_expr_new_cast(const char *type_name,
|
||||
__isl_take pet_expr *arg) {
|
||||
die();
|
||||
}
|
||||
__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos,
|
||||
__isl_take pet_expr *arg) {
|
||||
die();
|
||||
}
|
||||
__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree) {
|
||||
die();
|
||||
}
|
||||
__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree) {
|
||||
die();
|
||||
}
|
||||
__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree,
|
||||
__isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user),
|
||||
void *user) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_union_map *pet_expr_access_get_may_read(
|
||||
__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
__isl_give isl_union_map *pet_expr_access_get_may_write(
|
||||
__isl_keep pet_expr *expr) {
|
||||
die();
|
||||
}
|
||||
5849
polly/lib/External/ppcg/gpu.c
vendored
5849
polly/lib/External/ppcg/gpu.c
vendored
File diff suppressed because it is too large
Load Diff
459
polly/lib/External/ppcg/gpu.h
vendored
459
polly/lib/External/ppcg/gpu.h
vendored
@ -1,459 +0,0 @@
|
||||
#ifndef _GPU_H
|
||||
#define _GPU_H
|
||||
|
||||
#include <isl/ast.h>
|
||||
#include <isl/id.h>
|
||||
#include <isl/id_to_ast_expr.h>
|
||||
|
||||
#include <pet.h>
|
||||
|
||||
#include "ppcg.h"
|
||||
#include "ppcg_options.h"
|
||||
|
||||
/* An access to an outer array element or an iterator.
|
||||
* Accesses to iterators have an access relation that maps to an unnamed space.
|
||||
* An access may be both read and write.
|
||||
* If the access relation is empty, then the output dimension may
|
||||
* not be equal to the dimension of the corresponding array.
|
||||
*/
|
||||
struct gpu_stmt_access {
|
||||
/* Access reads elements */
|
||||
int read;
|
||||
/* Access writes elements */
|
||||
int write;
|
||||
/* All writes are definite writes. */
|
||||
int exact_write;
|
||||
/* Is a single, fixed element being accessed? */
|
||||
isl_bool fixed_element;
|
||||
/* The number of index expressions specified in the access. */
|
||||
int n_index;
|
||||
|
||||
/* May access relation */
|
||||
isl_map *access;
|
||||
/* May access relation with as domain a mapping from iteration domain
|
||||
* to a reference identifier.
|
||||
*/
|
||||
isl_map *tagged_access;
|
||||
/* The reference id of the corresponding pet_expr. */
|
||||
isl_id *ref_id;
|
||||
|
||||
struct gpu_stmt_access *next;
|
||||
};
|
||||
|
||||
/* A representation of a user statement.
|
||||
* "stmt" points to the corresponding pet statement.
|
||||
* "id" is the identifier of the instance set of the statement.
|
||||
* "accesses" is a linked list of accesses performed by the statement.
|
||||
* If the statement has been killed, i.e., if it will not be scheduled,
|
||||
* then this linked list may be empty even if the actual statement does
|
||||
* perform accesses.
|
||||
*/
|
||||
struct gpu_stmt {
|
||||
isl_id *id;
|
||||
struct pet_stmt *stmt;
|
||||
|
||||
struct gpu_stmt_access *accesses;
|
||||
};
|
||||
|
||||
/* Represents an outer array possibly accessed by a gpu_prog.
|
||||
*/
|
||||
struct gpu_array_info {
|
||||
/* The array data space. */
|
||||
isl_space *space;
|
||||
/* Element type. */
|
||||
char *type;
|
||||
/* Element size. */
|
||||
int size;
|
||||
/* Name of the array. */
|
||||
char *name;
|
||||
/* Declared extent of original array. */
|
||||
isl_set *declared_extent;
|
||||
/* AST expression for declared size of original array. */
|
||||
isl_ast_expr *declared_size;
|
||||
/* Extent of the array that needs to be copied. */
|
||||
isl_set *extent;
|
||||
/* Number of indices. */
|
||||
unsigned n_index;
|
||||
/* For each index, a bound on "extent" in that direction. */
|
||||
isl_multi_pw_aff *bound;
|
||||
/* The corresponding access AST expression, if the array needs
|
||||
* to be allocated on the device.
|
||||
*/
|
||||
isl_ast_expr *bound_expr;
|
||||
|
||||
/* All references to this array; point to elements of a linked list. */
|
||||
int n_ref;
|
||||
struct gpu_stmt_access **refs;
|
||||
|
||||
/* Is this array accessed at all by the program? */
|
||||
int accessed;
|
||||
|
||||
/* Is this a scalar that is read-only within the entire program? */
|
||||
int read_only_scalar;
|
||||
|
||||
/* Are the elements of the array structures? */
|
||||
int has_compound_element;
|
||||
|
||||
/* Are the elements only accessed through constant index expressions? */
|
||||
int only_fixed_element;
|
||||
|
||||
/* Is the array local to the scop? */
|
||||
int local;
|
||||
/* Is the array local and should it be declared on the host? */
|
||||
int declare_local;
|
||||
|
||||
/* Is the corresponding global device memory accessed in any way? */
|
||||
int global;
|
||||
|
||||
/* Should the array be linearized? */
|
||||
int linearize;
|
||||
|
||||
/* Order dependences on this array.
|
||||
* Only used if live_range_reordering option is set.
|
||||
* It is set to NULL otherwise.
|
||||
*/
|
||||
isl_union_map *dep_order;
|
||||
|
||||
void *user;
|
||||
};
|
||||
|
||||
/* Represents an outer array accessed by a ppcg_kernel, localized
|
||||
* to the context of this kernel.
|
||||
*
|
||||
* "array" points to the corresponding array in the gpu_prog.
|
||||
* The "n_group" "groups" are the reference groups associated to the array.
|
||||
* If "force_private" is set, then the array (in practice a scalar)
|
||||
* must be mapped to a register.
|
||||
* "global" is set if the global device memory corresponding
|
||||
* to this array is accessed by the kernel.
|
||||
* "bound" is equal to array->bound specialized to the current kernel.
|
||||
* "bound_expr" is the corresponding access AST expression.
|
||||
*/
|
||||
struct gpu_local_array_info {
|
||||
struct gpu_array_info *array;
|
||||
|
||||
int n_group;
|
||||
struct gpu_array_ref_group **groups;
|
||||
|
||||
int force_private;
|
||||
int global;
|
||||
|
||||
unsigned n_index;
|
||||
isl_multi_pw_aff *bound;
|
||||
isl_ast_expr *bound_expr;
|
||||
};
|
||||
|
||||
__isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
|
||||
struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr);
|
||||
|
||||
/* A list of type names: "name" holds "n" entries.
 */
struct gpu_types {
	int n;
	char **name;
};
|
||||
|
||||
/* "read" and "write" contain the original access relations, possibly
|
||||
* involving member accesses.
|
||||
*
|
||||
* The elements of "array", as well as the ranges of "copy_in" and "copy_out"
|
||||
* only refer to the outer arrays of any possible member accesses.
|
||||
*/
|
||||
struct gpu_prog {
|
||||
isl_ctx *ctx;
|
||||
|
||||
struct ppcg_scop *scop;
|
||||
|
||||
/* Set of parameter values */
|
||||
isl_set *context;
|
||||
|
||||
/* All potential read accesses in the entire program */
|
||||
isl_union_map *read;
|
||||
|
||||
/* All potential write accesses in the entire program */
|
||||
isl_union_map *may_write;
|
||||
/* All definite write accesses in the entire program */
|
||||
isl_union_map *must_write;
|
||||
/* All tagged definite kills in the entire program */
|
||||
isl_union_map *tagged_must_kill;
|
||||
|
||||
/* The set of inner array elements that may be preserved. */
|
||||
isl_union_set *may_persist;
|
||||
|
||||
/* A mapping from all innermost arrays to their outer arrays. */
|
||||
isl_union_map *to_outer;
|
||||
/* A mapping from the outer arrays to all corresponding inner arrays. */
|
||||
isl_union_map *to_inner;
|
||||
/* A mapping from all intermediate arrays to their outer arrays,
|
||||
* including an identity mapping from the anonymous 1D space to itself.
|
||||
*/
|
||||
isl_union_map *any_to_outer;
|
||||
|
||||
/* Order dependences on non-scalars. */
|
||||
isl_union_map *array_order;
|
||||
|
||||
/* Array of statements */
|
||||
int n_stmts;
|
||||
struct gpu_stmt *stmts;
|
||||
|
||||
int n_array;
|
||||
struct gpu_array_info *array;
|
||||
};
|
||||
|
||||
struct gpu_gen {
|
||||
isl_ctx *ctx;
|
||||
struct ppcg_options *options;
|
||||
|
||||
/* Callback for printing of AST in appropriate format. */
|
||||
__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
|
||||
struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
|
||||
struct gpu_types *types, void *user);
|
||||
void *print_user;
|
||||
|
||||
isl_id_to_ast_expr *(*build_ast_expr)(void *stmt,
|
||||
isl_ast_build *build,
|
||||
isl_multi_pw_aff *(*fn_index)(
|
||||
__isl_take isl_multi_pw_aff *mpa, isl_id *id,
|
||||
void *user),
|
||||
void *user_index,
|
||||
isl_ast_expr *(*fn_expr)(isl_ast_expr *expr,
|
||||
isl_id *id, void *user),
|
||||
void *user_expr);
|
||||
|
||||
struct gpu_prog *prog;
|
||||
/* The generated AST. */
|
||||
isl_ast_node *tree;
|
||||
|
||||
/* The sequence of types for which a definition has been printed. */
|
||||
struct gpu_types types;
|
||||
|
||||
/* User specified tile, grid and block sizes for each kernel */
|
||||
isl_union_map *sizes;
|
||||
|
||||
/* Effectively used tile, grid and block sizes for each kernel */
|
||||
isl_union_map *used_sizes;
|
||||
|
||||
/* Identifier of the next kernel. */
|
||||
int kernel_id;
|
||||
};
|
||||
|
||||
/* The kind of a kernel variable or access group: global, shared or
 * private.  The values are the ones implied by the declaration order.
 */
enum ppcg_group_access_type {
	ppcg_access_global = 0,
	ppcg_access_shared = 1,
	ppcg_access_private = 2
};
|
||||
|
||||
enum ppcg_kernel_stmt_type {
|
||||
ppcg_kernel_copy,
|
||||
ppcg_kernel_domain,
|
||||
ppcg_kernel_sync
|
||||
};
|
||||
|
||||
/* Representation of special statements, in particular copy statements
|
||||
* and __syncthreads statements, inside a kernel.
|
||||
*
|
||||
* type represents the kind of statement
|
||||
*
|
||||
*
|
||||
* for ppcg_kernel_copy statements we have
|
||||
*
|
||||
* read is set if the statement should copy data from global memory
|
||||
* to shared memory or registers.
|
||||
*
|
||||
* index expresses an access to the array element that needs to be copied
|
||||
* local_index expresses the corresponding element in the tile
|
||||
*
|
||||
* array refers to the original array being copied
|
||||
* local_array is a pointer to the appropriate element in the "array"
|
||||
* array of the ppcg_kernel to which this copy access belongs
|
||||
*
|
||||
*
|
||||
* for ppcg_kernel_domain statements we have
|
||||
*
|
||||
* stmt is the corresponding input statement
|
||||
*
|
||||
* n_access is the number of accesses in stmt
|
||||
* access is an array of local information about the accesses
|
||||
*/
|
||||
struct ppcg_kernel_stmt {
|
||||
enum ppcg_kernel_stmt_type type;
|
||||
|
||||
union {
|
||||
struct {
|
||||
int read;
|
||||
isl_ast_expr *index;
|
||||
isl_ast_expr *local_index;
|
||||
struct gpu_array_info *array;
|
||||
struct gpu_local_array_info *local_array;
|
||||
} c;
|
||||
struct {
|
||||
struct gpu_stmt *stmt;
|
||||
isl_id_to_ast_expr *ref2expr;
|
||||
} d;
|
||||
} u;
|
||||
};
|
||||
|
||||
/* Representation of a local variable in a kernel.
|
||||
*/
|
||||
struct ppcg_kernel_var {
|
||||
struct gpu_array_info *array;
|
||||
enum ppcg_group_access_type type;
|
||||
char *name;
|
||||
isl_vec *size;
|
||||
};
|
||||
|
||||
/* Representation of a kernel.
|
||||
*
|
||||
* prog describes the original code from which the kernel is extracted.
|
||||
*
|
||||
* id is the sequence number of the kernel.
|
||||
*
|
||||
* block_ids contains the list of block identifiers for this kernel.
|
||||
* thread_ids contains the list of thread identifiers for this kernel.
|
||||
*
|
||||
* the first n_grid elements of grid_dim represent the specified size
|
||||
* of the grid.
|
||||
* the first n_block elements of block_dim represent the specified or
|
||||
* effective size of the block.
|
||||
* Note that in the input file, the sizes of the grid and the blocks
|
||||
* are specified in the order x, y, z, but internally, the sizes
|
||||
* are stored in reverse order, so that the last element always
|
||||
* refers to the x dimension.
|
||||
*
|
||||
* grid_size reflects the effective grid size.
|
||||
* grid_size_expr contains a corresponding access AST expression, built within
|
||||
* the context where the launch appears.
|
||||
*
|
||||
* context contains the values of the parameters and outer schedule dimensions
|
||||
* for which any statement instance in this kernel needs to be executed.
|
||||
*
|
||||
* n_sync is the number of synchronization operations that have
|
||||
* been introduced in the schedule tree corresponding to this kernel (so far).
|
||||
*
|
||||
* core contains the spaces of the statement domains that form
|
||||
* the core computation of the kernel. It is used to navigate
|
||||
* the tree during the construction of the device part of the schedule
|
||||
* tree in gpu_create_kernel.
|
||||
*
|
||||
* expanded_domain contains the original statement instances,
|
||||
* i.e., those that appear in the domains of access relations,
|
||||
* that are involved in the kernel.
|
||||
* contraction maps those original statement instances to
|
||||
* the statement instances that are active at the point
|
||||
* in the schedule tree where the kernel is created.
|
||||
*
|
||||
* arrays is the set of possibly accessed outer array elements.
|
||||
*
|
||||
* space is the schedule space of the AST context. That is, it represents
|
||||
* the loops of the generated host code containing the kernel launch.
|
||||
*
|
||||
* n_array is the total number of arrays in the input program and also
|
||||
* the number of elements in the "array" array.
|
||||
* array contains information about each array that is local
|
||||
* to the current kernel. If an array is not used in a kernel,
|
||||
* then the corresponding entry does not contain any information.
|
||||
*
|
||||
* any_force_private is set if any array in the kernel is marked force_private
|
||||
*
|
||||
* block_filter contains constraints on the domain elements in the kernel
|
||||
* that encode the mapping to block identifiers, where the block identifiers
|
||||
* are represented by "n_grid" parameters with as names the elements
|
||||
* of "block_ids".
|
||||
*
|
||||
* thread_filter contains constraints on the domain elements in the kernel
|
||||
* that encode the mapping to thread identifiers, where the thread identifiers
|
||||
* are represented by "n_block" parameters with as names the elements
|
||||
* of "thread_ids".
|
||||
*
|
||||
* copy_schedule corresponds to the schedule dimensions of
|
||||
* the (tiled) schedule for this kernel that have been taken into account
|
||||
* for computing private/shared memory tiles.
|
||||
* The domain corresponds to the original statement instances, i.e.,
|
||||
* those that appear in the leaves of the schedule tree.
|
||||
* copy_schedule_dim is the dimension of this schedule.
|
||||
*
|
||||
* sync_writes contains write references that require synchronization.
|
||||
* Each reference is represented by a universe set in a space [S[i,j] -> R[]]
|
||||
* with S[i,j] the statement instance space and R[] the array reference.
|
||||
*/
|
||||
struct ppcg_kernel {
|
||||
isl_ctx *ctx;
|
||||
struct ppcg_options *options;
|
||||
|
||||
struct gpu_prog *prog;
|
||||
|
||||
int id;
|
||||
|
||||
isl_id_list *block_ids;
|
||||
isl_id_list *thread_ids;
|
||||
|
||||
int n_grid;
|
||||
int n_block;
|
||||
int grid_dim[2];
|
||||
int block_dim[3];
|
||||
|
||||
isl_multi_pw_aff *grid_size;
|
||||
isl_ast_expr *grid_size_expr;
|
||||
isl_set *context;
|
||||
|
||||
int n_sync;
|
||||
isl_union_set *core;
|
||||
isl_union_set *arrays;
|
||||
|
||||
isl_union_pw_multi_aff *contraction;
|
||||
isl_union_set *expanded_domain;
|
||||
|
||||
isl_space *space;
|
||||
|
||||
int n_array;
|
||||
struct gpu_local_array_info *array;
|
||||
|
||||
int n_var;
|
||||
struct ppcg_kernel_var *var;
|
||||
|
||||
int any_force_private;
|
||||
|
||||
isl_union_set *block_filter;
|
||||
isl_union_set *thread_filter;
|
||||
isl_union_pw_multi_aff *copy_schedule;
|
||||
int copy_schedule_dim;
|
||||
|
||||
isl_union_set *sync_writes;
|
||||
|
||||
isl_ast_node *tree;
|
||||
};
|
||||
|
||||
int gpu_array_is_scalar(struct gpu_array_info *array);
|
||||
int gpu_array_is_read_only_scalar(struct gpu_array_info *array);
|
||||
int gpu_array_requires_device_allocation(struct gpu_array_info *array);
|
||||
__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array);
|
||||
isl_bool gpu_array_can_be_private(struct gpu_array_info *array);
|
||||
|
||||
struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop);
|
||||
void *gpu_prog_free(struct gpu_prog *prog);
|
||||
|
||||
int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i);
|
||||
|
||||
int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
|
||||
struct ppcg_options *options,
|
||||
__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
|
||||
struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
|
||||
struct gpu_types *types, void *user), void *user);
|
||||
|
||||
__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
|
||||
__isl_take isl_schedule_node *node, int scale,
|
||||
__isl_keep isl_multi_val *sizes);
|
||||
|
||||
__isl_give isl_schedule *get_schedule(struct gpu_gen *gen);
|
||||
int has_any_permutable_node(__isl_keep isl_schedule *schedule);
|
||||
__isl_give isl_schedule *map_to_device(struct gpu_gen *gen,
|
||||
__isl_take isl_schedule *schedule,
|
||||
int to_from_device);
|
||||
__isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
|
||||
__isl_take isl_schedule *schedule);
|
||||
|
||||
__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog);
|
||||
void collect_references(struct gpu_prog *prog, struct gpu_array_info *array);
|
||||
void collect_order_dependences(struct gpu_prog *prog);
|
||||
isl_bool only_fixed_element_accessed(struct gpu_array_info *array);
|
||||
#endif
|
||||
71
polly/lib/External/ppcg/gpu_array_tile.c
vendored
71
polly/lib/External/ppcg/gpu_array_tile.c
vendored
@ -1,71 +0,0 @@
|
||||
#include <isl/aff.h>
|
||||
#include <isl/map.h>
|
||||
|
||||
#include "gpu_array_tile.h"
|
||||
|
||||
struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile)
|
||||
{
|
||||
int j;
|
||||
|
||||
if (!tile)
|
||||
return NULL;
|
||||
|
||||
for (j = 0; j < tile->n; ++j) {
|
||||
isl_val_free(tile->bound[j].size);
|
||||
isl_val_free(tile->bound[j].stride);
|
||||
isl_aff_free(tile->bound[j].lb);
|
||||
isl_aff_free(tile->bound[j].shift);
|
||||
}
|
||||
free(tile->bound);
|
||||
isl_multi_aff_free(tile->tiling);
|
||||
free(tile);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Create a gpu_array_tile for an array of dimension "n_index".
|
||||
*/
|
||||
struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index)
|
||||
{
|
||||
int i;
|
||||
struct gpu_array_tile *tile;
|
||||
|
||||
tile = isl_calloc_type(ctx, struct gpu_array_tile);
|
||||
if (!tile)
|
||||
return NULL;
|
||||
|
||||
tile->ctx = ctx;
|
||||
tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
|
||||
if (!tile->bound)
|
||||
return gpu_array_tile_free(tile);
|
||||
|
||||
tile->n = n_index;
|
||||
|
||||
for (i = 0; i < n_index; ++i) {
|
||||
tile->bound[i].size = NULL;
|
||||
tile->bound[i].lb = NULL;
|
||||
tile->bound[i].stride = NULL;
|
||||
tile->bound[i].shift = NULL;
|
||||
}
|
||||
|
||||
return tile;
|
||||
}
|
||||
|
||||
/* Compute the size of the tile specified by "tile"
|
||||
* in number of elements and return the result.
|
||||
*/
|
||||
__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile)
|
||||
{
|
||||
int i;
|
||||
isl_val *size;
|
||||
|
||||
if (!tile)
|
||||
return NULL;
|
||||
|
||||
size = isl_val_one(tile->ctx);
|
||||
|
||||
for (i = 0; i < tile->n; ++i)
|
||||
size = isl_val_mul(size, isl_val_copy(tile->bound[i].size));
|
||||
|
||||
return size;
|
||||
}
|
||||
59
polly/lib/External/ppcg/gpu_array_tile.h
vendored
59
polly/lib/External/ppcg/gpu_array_tile.h
vendored
@ -1,59 +0,0 @@
|
||||
#ifndef GPU_ARRAY_TILE_H
|
||||
#define GPU_ARRAY_TILE_H
|
||||
|
||||
#include <isl/aff_type.h>
|
||||
#include <isl/map_type.h>
|
||||
#include <isl/val.h>
|
||||
|
||||
/* The fields stride and shift only contain valid information
|
||||
* if shift != NULL.
|
||||
* If so, they express that current index is such that if you add shift,
|
||||
* then the result is always a multiple of stride.
|
||||
* Let D represent the initial tile->depth dimensions of the computed schedule.
|
||||
* The spaces of "lb" and "shift" are of the form
|
||||
*
|
||||
* D -> [b]
|
||||
*/
|
||||
struct gpu_array_bound {
|
||||
isl_val *size;
|
||||
isl_aff *lb;
|
||||
|
||||
isl_val *stride;
|
||||
isl_aff *shift;
|
||||
};
|
||||
|
||||
/* A tile of an outer array.
|
||||
*
|
||||
* requires_unroll is set if the schedule dimensions that are mapped
|
||||
* to threads need to be unrolled for this (private) tile to be used.
|
||||
*
|
||||
* "depth" reflects the number of schedule dimensions that affect the tile.
|
||||
* The copying into and/or out of the tile is performed at that depth.
|
||||
*
|
||||
* n is the dimension of the array.
|
||||
* bound is an array of size "n" representing the lower bound
|
||||
* and size for each index.
|
||||
*
|
||||
* tiling maps a tile in the global array to the corresponding
|
||||
* shared/private memory tile and is of the form
|
||||
*
|
||||
* { [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
|
||||
*
|
||||
* where D represents the initial "depth" dimensions
|
||||
* of the computed schedule.
|
||||
*/
|
||||
struct gpu_array_tile {
|
||||
isl_ctx *ctx;
|
||||
int requires_unroll;
|
||||
int depth;
|
||||
int n;
|
||||
struct gpu_array_bound *bound;
|
||||
isl_multi_aff *tiling;
|
||||
};
|
||||
|
||||
struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index);
|
||||
struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile);
|
||||
|
||||
__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile);
|
||||
|
||||
#endif
|
||||
1828
polly/lib/External/ppcg/gpu_group.c
vendored
1828
polly/lib/External/ppcg/gpu_group.c
vendored
File diff suppressed because it is too large
Load Diff
65
polly/lib/External/ppcg/gpu_group.h
vendored
65
polly/lib/External/ppcg/gpu_group.h
vendored
@ -1,65 +0,0 @@
|
||||
#ifndef GPU_GROUP_H
|
||||
#define GPU_GROUP_H
|
||||
|
||||
#include <isl/schedule_node.h>
|
||||
#include "gpu.h"
|
||||
|
||||
/* A group of array references in a kernel that should be handled together.
|
||||
* If private_tile is not NULL, then it is mapped to registers.
|
||||
* Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
|
||||
* Otherwise, it is accessed from global memory.
|
||||
* Note that if both private_tile and shared_tile are set, then shared_tile
|
||||
* is only used inside group_common_shared_memory_tile.
|
||||
*/
|
||||
struct gpu_array_ref_group {
|
||||
/* The references in this group access this local array. */
|
||||
struct gpu_local_array_info *local_array;
|
||||
/* This is the corresponding array. */
|
||||
struct gpu_array_info *array;
|
||||
/* Position of this group in the list of reference groups of array. */
|
||||
int nr;
|
||||
|
||||
/* The following fields are used during the construction of the groups.
|
||||
* access is the combined access relation relative to the private
|
||||
* memory tiling. In particular, the domain of the map corresponds
|
||||
* to the first thread_depth dimensions of the kernel schedule.
|
||||
* write is set if any access in the group is a write.
|
||||
* exact_write is set if all writes are definite writes.
|
||||
* slice is set if there is at least one access in the group
|
||||
* that refers to more than one element
|
||||
* "min_depth" is the minimum of the tile depths and thread_depth.
|
||||
*/
|
||||
isl_map *access;
|
||||
int write;
|
||||
int exact_write;
|
||||
int slice;
|
||||
int min_depth;
|
||||
|
||||
/* The shared memory tile, NULL if none. */
|
||||
struct gpu_array_tile *shared_tile;
|
||||
|
||||
/* The private memory tile, NULL if none. */
|
||||
struct gpu_array_tile *private_tile;
|
||||
|
||||
/* References in this group; point to elements of a linked list. */
|
||||
int n_ref;
|
||||
struct gpu_stmt_access **refs;
|
||||
};
|
||||
|
||||
int gpu_group_references(struct ppcg_kernel *kernel,
|
||||
__isl_keep isl_schedule_node *node);
|
||||
|
||||
__isl_give isl_printer *gpu_array_ref_group_print_name(
|
||||
struct gpu_array_ref_group *group, __isl_take isl_printer *p);
|
||||
void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group);
|
||||
__isl_give isl_union_map *gpu_array_ref_group_access_relation(
|
||||
struct gpu_array_ref_group *group, int read, int write);
|
||||
int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group);
|
||||
enum ppcg_group_access_type gpu_array_ref_group_type(
|
||||
struct gpu_array_ref_group *group);
|
||||
struct gpu_array_tile *gpu_array_ref_group_tile(
|
||||
struct gpu_array_ref_group *group);
|
||||
struct gpu_array_ref_group *gpu_array_ref_group_free(
|
||||
struct gpu_array_ref_group *group);
|
||||
|
||||
#endif
|
||||
146
polly/lib/External/ppcg/gpu_hybrid.c
vendored
146
polly/lib/External/ppcg/gpu_hybrid.c
vendored
@ -1,146 +0,0 @@
|
||||
/*
|
||||
* Copyright 2013 Ecole Normale Superieure
|
||||
* Copyright 2015 Sven Verdoolaege
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege,
|
||||
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <isl/val.h>
|
||||
#include <isl/space.h>
|
||||
#include <isl/union_set.h>
|
||||
#include <isl/schedule_node.h>
|
||||
|
||||
#include "hybrid.h"
|
||||
#include "gpu_hybrid.h"
|
||||
#include "gpu_tree.h"
|
||||
#include "schedule.h"
|
||||
#include "util.h"
|
||||
|
||||
/* Have all domain elements been filtered out before reaching
|
||||
* the "node" position in the schedule tree?
|
||||
*/
|
||||
static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node)
|
||||
{
|
||||
isl_union_set *domain;
|
||||
isl_bool empty;
|
||||
|
||||
domain = isl_schedule_node_get_domain(node);
|
||||
empty = isl_union_set_is_empty(domain);
|
||||
isl_union_set_free(domain);
|
||||
|
||||
return empty;
|
||||
}
|
||||
|
||||
/* Given a pointer to a phase in the result of hybrid tiling,
|
||||
* map the phase to the device, provided the phase is non-empty.
|
||||
* Empty phases can occur if the input schedule domain can be
|
||||
* covered by a small number of hexagons that all belong to the same phase.
|
||||
*
|
||||
* The input has the following form:
|
||||
*
|
||||
* M - CT - P - C - ...
|
||||
*
|
||||
* with M the phase marker, CT the space tiling, P the original
|
||||
* parent band and C the original child band.
|
||||
* The (outer dimensions of the) C band need to be mapped to threads.
|
||||
* The (outer dimension of the) CT band needs to be mapped to blocks.
|
||||
* The mapping to shared memory needs to be computed between the CT and
|
||||
* the P band.
|
||||
*
|
||||
* The C band is first shifted to start at zero.
|
||||
* Then the appropriate markers are introduced and a kernel is
|
||||
* created for the tree rooted at CT.
|
||||
* If the "unroll_gpu_tile" option is set, then the AST generator
|
||||
* is instructed to unroll the P and C bands.
|
||||
*/
|
||||
static __isl_give isl_schedule_node *update_phase(
|
||||
__isl_take isl_schedule_node *node, void *user)
|
||||
{
|
||||
struct gpu_gen *gen = user;
|
||||
int depth0, depth;
|
||||
isl_ctx *ctx;
|
||||
isl_id *id;
|
||||
isl_bool empty_domain;
|
||||
ppcg_ht_phase *phase;
|
||||
|
||||
empty_domain = has_empty_domain(node);
|
||||
if (empty_domain < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
if (empty_domain)
|
||||
return node;
|
||||
|
||||
if (!node)
|
||||
return NULL;
|
||||
ctx = isl_schedule_node_get_ctx(node);
|
||||
|
||||
phase = ppcg_ht_phase_extract_from_mark(node);
|
||||
|
||||
depth0 = isl_schedule_node_get_tree_depth(node);
|
||||
|
||||
node = isl_schedule_node_child(node, 0);
|
||||
|
||||
node = isl_schedule_node_child(node, 0);
|
||||
node = isl_schedule_node_child(node, 0);
|
||||
node = ppcg_ht_phase_shift_space_point(phase, node);
|
||||
if (gen->options->unroll_gpu_tile)
|
||||
node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
|
||||
id = isl_id_alloc(ctx, "thread", NULL);
|
||||
node = isl_schedule_node_insert_mark(node, id);
|
||||
node = isl_schedule_node_parent(node);
|
||||
if (gen->options->unroll_gpu_tile)
|
||||
node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
|
||||
id = isl_id_alloc(ctx, "shared", NULL);
|
||||
node = isl_schedule_node_insert_mark(node, id);
|
||||
node = isl_schedule_node_parent(node);
|
||||
|
||||
node = gpu_create_kernel(gen, node, 0, NULL);
|
||||
|
||||
depth = isl_schedule_node_get_tree_depth(node);
|
||||
node = isl_schedule_node_ancestor(node, depth - depth0);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Apply hybrid tiling on "node" and its parent based on the (valid)
|
||||
* bounds on the relative dependence distances "bounds" and
|
||||
* the tile sizes in "tile_sizes".
|
||||
* The number of elements in "tile_sizes" is at least as large
|
||||
* as the sum of the dimensions of the parent and the child node.
|
||||
*
|
||||
* Convert the tile_sizes to an isl_multi_val in the right space,
|
||||
* insert the hybrid tiling and then create a kernel inside each phase.
|
||||
* Finally, remove the phase marks.
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
|
||||
__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
|
||||
int *tile_sizes)
|
||||
{
|
||||
isl_multi_val *mv;
|
||||
isl_space *space, *space2;
|
||||
|
||||
if (!node || !bounds)
|
||||
goto error;
|
||||
|
||||
space2 = isl_schedule_node_band_get_space(node);
|
||||
node = isl_schedule_node_parent(node);
|
||||
space = isl_schedule_node_band_get_space(node);
|
||||
space = isl_space_product(space, space2);
|
||||
mv = ppcg_multi_val_from_int_list(space, tile_sizes);
|
||||
|
||||
node = ppcg_ht_bounds_insert_tiling(bounds, mv, node, gen->options);
|
||||
|
||||
node = hybrid_tile_foreach_phase(node, &update_phase, gen);
|
||||
|
||||
node = hybrid_tile_drop_phase_marks(node);
|
||||
|
||||
return node;
|
||||
error:
|
||||
isl_schedule_node_free(node);
|
||||
ppcg_ht_bounds_free(bounds);
|
||||
return NULL;
|
||||
}
|
||||
13
polly/lib/External/ppcg/gpu_hybrid.h
vendored
13
polly/lib/External/ppcg/gpu_hybrid.h
vendored
@ -1,13 +0,0 @@
|
||||
#ifndef GPU_HYBRID_H
|
||||
#define GPU_HYBRID_H
|
||||
|
||||
#include <isl/schedule_node.h>
|
||||
|
||||
#include "gpu.h"
|
||||
#include "hybrid.h"
|
||||
|
||||
__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
|
||||
__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
|
||||
int *tile_sizes);
|
||||
|
||||
#endif
|
||||
310
polly/lib/External/ppcg/gpu_print.c
vendored
310
polly/lib/External/ppcg/gpu_print.c
vendored
@ -1,310 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012 Ecole Normale Superieure
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege,
|
||||
* Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <isl/aff.h>
|
||||
|
||||
#include "gpu_print.h"
|
||||
#include "print.h"
|
||||
#include "schedule.h"
|
||||
|
||||
/* Print declarations to "p" for arrays that are local to "prog"
|
||||
* but that are used on the host and therefore require a declaration.
|
||||
*/
|
||||
__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p,
|
||||
struct gpu_prog *prog)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!prog)
|
||||
return isl_printer_free(p);
|
||||
|
||||
for (i = 0; i < prog->n_array; ++i) {
|
||||
struct gpu_array_info *array = &prog->array[i];
|
||||
isl_ast_expr *size;
|
||||
|
||||
if (!array->declare_local)
|
||||
continue;
|
||||
size = array->declared_size;
|
||||
p = ppcg_print_declaration_with_size(p, array->type, size);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print an expression for the size of "array" in bytes.
|
||||
*/
|
||||
__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn,
|
||||
struct gpu_array_info *array)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < array->n_index; ++i) {
|
||||
isl_ast_expr *bound;
|
||||
|
||||
prn = isl_printer_print_str(prn, "(");
|
||||
bound = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i);
|
||||
prn = isl_printer_print_ast_expr(prn, bound);
|
||||
isl_ast_expr_free(bound);
|
||||
prn = isl_printer_print_str(prn, ") * ");
|
||||
}
|
||||
prn = isl_printer_print_str(prn, "sizeof(");
|
||||
prn = isl_printer_print_str(prn, array->type);
|
||||
prn = isl_printer_print_str(prn, ")");
|
||||
|
||||
return prn;
|
||||
}
|
||||
|
||||
/* Print the declaration of a non-linearized array argument.
|
||||
*/
|
||||
static __isl_give isl_printer *print_non_linearized_declaration_argument(
|
||||
__isl_take isl_printer *p, struct gpu_array_info *array)
|
||||
{
|
||||
p = isl_printer_print_str(p, array->type);
|
||||
p = isl_printer_print_str(p, " ");
|
||||
|
||||
p = isl_printer_print_ast_expr(p, array->bound_expr);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the declaration of an array argument.
|
||||
* "memory_space" allows to specify a memory space prefix.
|
||||
*/
|
||||
__isl_give isl_printer *gpu_array_info_print_declaration_argument(
|
||||
__isl_take isl_printer *p, struct gpu_array_info *array,
|
||||
const char *memory_space)
|
||||
{
|
||||
if (gpu_array_is_read_only_scalar(array)) {
|
||||
p = isl_printer_print_str(p, array->type);
|
||||
p = isl_printer_print_str(p, " ");
|
||||
p = isl_printer_print_str(p, array->name);
|
||||
return p;
|
||||
}
|
||||
|
||||
if (memory_space) {
|
||||
p = isl_printer_print_str(p, memory_space);
|
||||
p = isl_printer_print_str(p, " ");
|
||||
}
|
||||
|
||||
if (array->n_index != 0 && !array->linearize)
|
||||
return print_non_linearized_declaration_argument(p, array);
|
||||
|
||||
p = isl_printer_print_str(p, array->type);
|
||||
p = isl_printer_print_str(p, " ");
|
||||
p = isl_printer_print_str(p, "*");
|
||||
p = isl_printer_print_str(p, array->name);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the call of an array argument.
|
||||
*/
|
||||
__isl_give isl_printer *gpu_array_info_print_call_argument(
|
||||
__isl_take isl_printer *p, struct gpu_array_info *array)
|
||||
{
|
||||
if (gpu_array_is_read_only_scalar(array))
|
||||
return isl_printer_print_str(p, array->name);
|
||||
|
||||
p = isl_printer_print_str(p, "dev_");
|
||||
p = isl_printer_print_str(p, array->name);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print an access to the element in the private/shared memory copy
|
||||
* described by "stmt". The index of the copy is recorded in
|
||||
* stmt->local_index as an access to the array.
|
||||
*/
|
||||
static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
|
||||
struct ppcg_kernel_stmt *stmt)
|
||||
{
|
||||
return isl_printer_print_ast_expr(p, stmt->u.c.local_index);
|
||||
}
|
||||
|
||||
/* Print an access to the element in the global memory copy
|
||||
* described by "stmt". The index of the copy is recorded in
|
||||
* stmt->index as an access to the array.
|
||||
*/
|
||||
static __isl_give isl_printer *stmt_print_global_index(
|
||||
__isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
|
||||
{
|
||||
struct gpu_array_info *array = stmt->u.c.array;
|
||||
isl_ast_expr *index;
|
||||
|
||||
if (gpu_array_is_scalar(array)) {
|
||||
if (!gpu_array_is_read_only_scalar(array))
|
||||
p = isl_printer_print_str(p, "*");
|
||||
p = isl_printer_print_str(p, array->name);
|
||||
return p;
|
||||
}
|
||||
|
||||
index = isl_ast_expr_copy(stmt->u.c.index);
|
||||
|
||||
p = isl_printer_print_ast_expr(p, index);
|
||||
isl_ast_expr_free(index);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print a copy statement.
|
||||
*
|
||||
* A read copy statement is printed as
|
||||
*
|
||||
* local = global;
|
||||
*
|
||||
* while a write copy statement is printed as
|
||||
*
|
||||
* global = local;
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p,
|
||||
struct ppcg_kernel_stmt *stmt)
|
||||
{
|
||||
p = isl_printer_start_line(p);
|
||||
if (stmt->u.c.read) {
|
||||
p = stmt_print_local_index(p, stmt);
|
||||
p = isl_printer_print_str(p, " = ");
|
||||
p = stmt_print_global_index(p, stmt);
|
||||
} else {
|
||||
p = stmt_print_global_index(p, stmt);
|
||||
p = isl_printer_print_str(p, " = ");
|
||||
p = stmt_print_local_index(p, stmt);
|
||||
}
|
||||
p = isl_printer_print_str(p, ";");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the domain statement "stmt" by printing the body of
 * the corresponding pet statement, passing along
 * the stmt->u.d.ref2expr mapping.
 */
__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p,
	struct ppcg_kernel_stmt *stmt)
{
	struct gpu_stmt *input = stmt->u.d.stmt;

	p = pet_stmt_print_body(input->stmt, p, stmt->u.d.ref2expr);

	return p;
}
|
||||
|
||||
/* This function is called for each node in a GPU AST.
|
||||
* In case of a user node, print the macro definitions required
|
||||
* for printing the AST expressions in the annotation, if any.
|
||||
* For other nodes, return true such that descendants are also
|
||||
* visited.
|
||||
*
|
||||
* In particular, for a kernel launch, print the macro definitions
|
||||
* needed for the grid size.
|
||||
* For a copy statement, print the macro definitions needed
|
||||
* for the two index expressions.
|
||||
* For an original user statement, print the macro definitions
|
||||
* needed for the substitutions.
|
||||
*/
|
||||
static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
|
||||
{
|
||||
const char *name;
|
||||
isl_id *id;
|
||||
int is_kernel;
|
||||
struct ppcg_kernel *kernel;
|
||||
struct ppcg_kernel_stmt *stmt;
|
||||
isl_printer **p = user;
|
||||
|
||||
if (isl_ast_node_get_type(node) != isl_ast_node_user)
|
||||
return isl_bool_true;
|
||||
|
||||
id = isl_ast_node_get_annotation(node);
|
||||
if (!id)
|
||||
return isl_bool_false;
|
||||
|
||||
name = isl_id_get_name(id);
|
||||
if (!name)
|
||||
return isl_bool_error;
|
||||
is_kernel = !strcmp(name, "kernel");
|
||||
kernel = is_kernel ? isl_id_get_user(id) : NULL;
|
||||
stmt = is_kernel ? NULL : isl_id_get_user(id);
|
||||
isl_id_free(id);
|
||||
|
||||
if ((is_kernel && !kernel) || (!is_kernel && !stmt))
|
||||
return isl_bool_error;
|
||||
|
||||
if (is_kernel) {
|
||||
*p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p);
|
||||
} else if (stmt->type == ppcg_kernel_copy) {
|
||||
*p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p);
|
||||
*p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p);
|
||||
} else if (stmt->type == ppcg_kernel_domain) {
|
||||
*p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr);
|
||||
}
|
||||
if (!*p)
|
||||
return isl_bool_error;
|
||||
|
||||
return isl_bool_false;
|
||||
}
|
||||
|
||||
/* Print the required macros for the GPU AST "node" to "p",
|
||||
* including those needed for the user statements inside the AST.
|
||||
*/
|
||||
__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
|
||||
__isl_keep isl_ast_node *node)
|
||||
{
|
||||
if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0)
|
||||
return isl_printer_free(p);
|
||||
p = ppcg_print_macros(p, node);
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Was the definition of "type" printed before?
|
||||
* That is, does its name appear in the list of printed types "types"?
|
||||
*/
|
||||
static int already_printed(struct gpu_types *types,
|
||||
struct pet_type *type)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < types->n; ++i)
|
||||
if (!strcmp(types->name[i], type->name))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Print the definitions of all types prog->scop that have not been
|
||||
* printed before (according to "types") on "p".
|
||||
* Extend the list of printed types "types" with the newly printed types.
|
||||
*/
|
||||
__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p,
|
||||
struct gpu_types *types, struct gpu_prog *prog)
|
||||
{
|
||||
int i, n;
|
||||
isl_ctx *ctx;
|
||||
char **name;
|
||||
|
||||
n = prog->scop->pet->n_type;
|
||||
|
||||
if (n == 0)
|
||||
return p;
|
||||
|
||||
ctx = isl_printer_get_ctx(p);
|
||||
name = isl_realloc_array(ctx, types->name, char *, types->n + n);
|
||||
if (!name)
|
||||
return isl_printer_free(p);
|
||||
types->name = name;
|
||||
|
||||
for (i = 0; i < n; ++i) {
|
||||
struct pet_type *type = prog->scop->pet->types[i];
|
||||
|
||||
if (already_printed(types, type))
|
||||
continue;
|
||||
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, type->definition);
|
||||
p = isl_printer_print_str(p, ";");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
types->name[types->n++] = strdup(type->name);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
28
polly/lib/External/ppcg/gpu_print.h
vendored
28
polly/lib/External/ppcg/gpu_print.h
vendored
@ -1,28 +0,0 @@
|
||||
#ifndef GPU_PRINT_H
|
||||
#define GPU_PRINT_H
|
||||
|
||||
#include "gpu.h"
|
||||
|
||||
__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p,
|
||||
struct gpu_prog *prog);
|
||||
|
||||
__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p,
|
||||
struct gpu_types *types, struct gpu_prog *prog);
|
||||
|
||||
__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
|
||||
__isl_keep isl_ast_node *node);
|
||||
|
||||
__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn,
|
||||
struct gpu_array_info *array);
|
||||
__isl_give isl_printer *gpu_array_info_print_declaration_argument(
|
||||
__isl_take isl_printer *p, struct gpu_array_info *array,
|
||||
const char *memory_space);
|
||||
__isl_give isl_printer *gpu_array_info_print_call_argument(
|
||||
__isl_take isl_printer *p, struct gpu_array_info *array);
|
||||
|
||||
__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p,
|
||||
struct ppcg_kernel_stmt *stmt);
|
||||
__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p,
|
||||
struct ppcg_kernel_stmt *stmt);
|
||||
|
||||
#endif
|
||||
640
polly/lib/External/ppcg/gpu_tree.c
vendored
640
polly/lib/External/ppcg/gpu_tree.c
vendored
@ -1,640 +0,0 @@
|
||||
/*
|
||||
* Copyright 2013 Ecole Normale Superieure
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege,
|
||||
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <isl/set.h>
|
||||
#include <isl/union_set.h>
|
||||
#include <isl/space.h>
|
||||
|
||||
#include "gpu_tree.h"
|
||||
|
||||
/* The functions in this file are used to navigate part of a schedule tree
|
||||
* that is mapped to blocks. Initially, this part consists of a linear
|
||||
* branch segment with a mark node with name "kernel" on the outer end
|
||||
* and a mark node with name "thread" on the inner end.
|
||||
* During the mapping to blocks, branching may be introduced, but only
|
||||
* one of the elements in each sequence contains the "thread" mark.
|
||||
* The filter of this element (and only this filter) contains
|
||||
* domain elements identified by the "core" argument of the functions
|
||||
* that move down this tree.
|
||||
*
|
||||
* Synchronization statements have a name that starts with "sync" and
|
||||
* a user pointer pointing to the kernel that contains the synchronization.
|
||||
* The functions inserting or detecting synchronizations take a ppcg_kernel
|
||||
* argument to be able to create or identify such statements.
|
||||
* They may also use two fields in this structure, the "core" field
|
||||
* to move around in the tree and the "n_sync" field to make sure that
|
||||
* each synchronization has a different name (within the kernel).
|
||||
*/
|
||||
|
||||
/* Is "node" a mark node with an identifier called "name"?
|
||||
*/
|
||||
static int is_marked(__isl_keep isl_schedule_node *node, const char *name)
|
||||
{
|
||||
isl_id *mark;
|
||||
int has_name;
|
||||
|
||||
if (!node)
|
||||
return -1;
|
||||
|
||||
if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
|
||||
return 0;
|
||||
|
||||
mark = isl_schedule_node_mark_get_id(node);
|
||||
if (!mark)
|
||||
return -1;
|
||||
|
||||
has_name = !strcmp(isl_id_get_name(mark), name);
|
||||
isl_id_free(mark);
|
||||
|
||||
return has_name;
|
||||
}
|
||||
|
||||
/* Is "node" a mark node with an identifier called "kernel"?
|
||||
*/
|
||||
int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node)
|
||||
{
|
||||
return is_marked(node, "kernel");
|
||||
}
|
||||
|
||||
/* Is "node" a mark node with an identifier called "shared"?
|
||||
*/
|
||||
static int node_is_shared(__isl_keep isl_schedule_node *node)
|
||||
{
|
||||
return is_marked(node, "shared");
|
||||
}
|
||||
|
||||
/* Is "node" a mark node with an identifier called "thread"?
|
||||
*/
|
||||
static int node_is_thread(__isl_keep isl_schedule_node *node)
|
||||
{
|
||||
return is_marked(node, "thread");
|
||||
}
|
||||
|
||||
/* Insert a mark node with identifier "shared" in front of "node".
|
||||
*/
|
||||
static __isl_give isl_schedule_node *insert_shared(
|
||||
__isl_take isl_schedule_node *node)
|
||||
{
|
||||
isl_ctx *ctx;
|
||||
isl_id *id;
|
||||
|
||||
ctx = isl_schedule_node_get_ctx(node);
|
||||
id = isl_id_alloc(ctx, "shared", NULL);
|
||||
node = isl_schedule_node_insert_mark(node, id);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Insert a "shared" mark in front of the "thread" mark
|
||||
* provided the linear branch between "node" and the "thread" mark
|
||||
* does not contain such a "shared" mark already.
|
||||
*
|
||||
* As a side effect, this function checks that the subtree at "node"
|
||||
* actually contains a "thread" mark and that there is no branching
|
||||
* in between "node" and this "thread" mark.
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
|
||||
__isl_take isl_schedule_node *node)
|
||||
{
|
||||
int depth0, depth;
|
||||
int any_shared = 0;
|
||||
|
||||
if (!node)
|
||||
return NULL;
|
||||
|
||||
depth0 = isl_schedule_node_get_tree_depth(node);
|
||||
|
||||
for (;;) {
|
||||
int is_thread;
|
||||
int n;
|
||||
|
||||
if (!any_shared) {
|
||||
any_shared = node_is_shared(node);
|
||||
if (any_shared < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
}
|
||||
is_thread = node_is_thread(node);
|
||||
if (is_thread < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
if (is_thread)
|
||||
break;
|
||||
n = isl_schedule_node_n_children(node);
|
||||
if (n == 0)
|
||||
isl_die(isl_schedule_node_get_ctx(node),
|
||||
isl_error_invalid,
|
||||
"no thread marker found",
|
||||
return isl_schedule_node_free(node));
|
||||
if (n > 1)
|
||||
isl_die(isl_schedule_node_get_ctx(node),
|
||||
isl_error_invalid,
|
||||
"expecting single thread marker",
|
||||
return isl_schedule_node_free(node));
|
||||
|
||||
node = isl_schedule_node_child(node, 0);
|
||||
}
|
||||
|
||||
if (!any_shared)
|
||||
node = insert_shared(node);
|
||||
depth = isl_schedule_node_get_tree_depth(node);
|
||||
node = isl_schedule_node_ancestor(node, depth - depth0);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Assuming "node" is a filter node, does it correspond to the branch
|
||||
* that contains the "thread" mark, i.e., does it contain any elements
|
||||
* in "core"?
|
||||
*/
|
||||
static int node_is_core(__isl_keep isl_schedule_node *node,
|
||||
__isl_keep isl_union_set *core)
|
||||
{
|
||||
int disjoint;
|
||||
isl_union_set *filter;
|
||||
|
||||
filter = isl_schedule_node_filter_get_filter(node);
|
||||
disjoint = isl_union_set_is_disjoint(filter, core);
|
||||
isl_union_set_free(filter);
|
||||
if (disjoint < 0)
|
||||
return -1;
|
||||
|
||||
return !disjoint;
|
||||
}
|
||||
|
||||
/* Move to the only child of "node" that has the "thread" mark as descendant,
|
||||
* where the branch containing this mark is identified by the domain elements
|
||||
* in "core".
|
||||
*
|
||||
* If "node" is not a sequence, then it only has one child and we move
|
||||
* to that single child.
|
||||
* Otherwise, we check each of the filters in the children, pick
|
||||
* the one that corresponds to "core" and return a pointer to the child
|
||||
* of the filter node.
|
||||
*/
|
||||
static __isl_give isl_schedule_node *core_child(
|
||||
__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
|
||||
{
|
||||
int i, n;
|
||||
|
||||
if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
|
||||
return isl_schedule_node_child(node, 0);
|
||||
|
||||
n = isl_schedule_node_n_children(node);
|
||||
for (i = 0; i < n; ++i) {
|
||||
int is_core;
|
||||
|
||||
node = isl_schedule_node_child(node, i);
|
||||
is_core = node_is_core(node, core);
|
||||
|
||||
if (is_core < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
if (is_core)
|
||||
return isl_schedule_node_child(node, 0);
|
||||
|
||||
node = isl_schedule_node_parent(node);
|
||||
}
|
||||
|
||||
isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
|
||||
"core child not found", return isl_schedule_node_free(node));
|
||||
}
|
||||
|
||||
/* Move down the branch between "kernel" and "thread" until
|
||||
* the "shared" mark is reached, where the branch containing the "shared"
|
||||
* mark is identified by the domain elements in "core".
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
|
||||
__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
|
||||
{
|
||||
int is_shared;
|
||||
|
||||
while ((is_shared = node_is_shared(node)) == 0)
|
||||
node = core_child(node, core);
|
||||
if (is_shared < 0)
|
||||
node = isl_schedule_node_free(node);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Move down the branch between "kernel" and "thread" until
|
||||
* the "thread" mark is reached, where the branch containing the "thread"
|
||||
* mark is identified by the domain elements in "core".
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_move_down_to_thread(
|
||||
__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
|
||||
{
|
||||
int is_thread;
|
||||
|
||||
while ((is_thread = node_is_thread(node)) == 0)
|
||||
node = core_child(node, core);
|
||||
if (is_thread < 0)
|
||||
node = isl_schedule_node_free(node);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Move up the tree underneath the "thread" mark until
|
||||
* the "thread" mark is reached.
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_move_up_to_thread(
|
||||
__isl_take isl_schedule_node *node)
|
||||
{
|
||||
int is_thread;
|
||||
|
||||
while ((is_thread = node_is_thread(node)) == 0)
|
||||
node = isl_schedule_node_parent(node);
|
||||
if (is_thread < 0)
|
||||
node = isl_schedule_node_free(node);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Move up the tree underneath the "kernel" mark until
|
||||
* the "kernel" mark is reached.
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel(
|
||||
__isl_take isl_schedule_node *node)
|
||||
{
|
||||
int is_kernel;
|
||||
|
||||
while ((is_kernel = gpu_tree_node_is_kernel(node)) == 0)
|
||||
node = isl_schedule_node_parent(node);
|
||||
if (is_kernel < 0)
|
||||
node = isl_schedule_node_free(node);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Move down from the "kernel" mark (or at least a node with schedule
|
||||
* depth smaller than or equal to "depth") to a band node at schedule
|
||||
* depth "depth". The "thread" mark is assumed to have a schedule
|
||||
* depth greater than or equal to "depth". The branch containing the
|
||||
* "thread" mark is identified by the domain elements in "core".
|
||||
*
|
||||
* If the desired schedule depth is in the middle of band node,
|
||||
* then the band node is split into two pieces, the second piece
|
||||
* at the desired schedule depth.
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_move_down_to_depth(
|
||||
__isl_take isl_schedule_node *node, int depth,
|
||||
__isl_keep isl_union_set *core)
|
||||
{
|
||||
int is_shared;
|
||||
int is_thread = 0;
|
||||
|
||||
while (node && isl_schedule_node_get_schedule_depth(node) < depth) {
|
||||
if (isl_schedule_node_get_type(node) ==
|
||||
isl_schedule_node_band) {
|
||||
int node_depth, node_dim;
|
||||
node_depth = isl_schedule_node_get_schedule_depth(node);
|
||||
node_dim = isl_schedule_node_band_n_member(node);
|
||||
if (node_depth + node_dim > depth)
|
||||
node = isl_schedule_node_band_split(node,
|
||||
depth - node_depth);
|
||||
}
|
||||
node = core_child(node, core);
|
||||
}
|
||||
while ((is_shared = node_is_shared(node)) == 0 &&
|
||||
(is_thread = node_is_thread(node)) == 0 &&
|
||||
isl_schedule_node_get_type(node) != isl_schedule_node_band)
|
||||
node = core_child(node, core);
|
||||
if (is_shared < 0 || is_thread < 0)
|
||||
node = isl_schedule_node_free(node);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Create a union set containing a single set with a tuple identifier
|
||||
* called "syncX" and user pointer equal to "kernel".
|
||||
*/
|
||||
static __isl_give isl_union_set *create_sync_domain(struct ppcg_kernel *kernel)
|
||||
{
|
||||
isl_space *space;
|
||||
isl_id *id;
|
||||
char name[40];
|
||||
|
||||
space = isl_space_set_alloc(kernel->ctx, 0, 0);
|
||||
snprintf(name, sizeof(name), "sync%d", kernel->n_sync++);
|
||||
id = isl_id_alloc(kernel->ctx, name, kernel);
|
||||
space = isl_space_set_tuple_id(space, isl_dim_set, id);
|
||||
return isl_union_set_from_set(isl_set_universe(space));
|
||||
}
|
||||
|
||||
/* Is "id" the identifier of a synchronization statement inside "kernel"?
|
||||
* That is, does its name start with "sync" and does it point to "kernel"?
|
||||
*/
|
||||
int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel)
|
||||
{
|
||||
const char *name;
|
||||
|
||||
name = isl_id_get_name(id);
|
||||
if (!name)
|
||||
return 0;
|
||||
else if (strncmp(name, "sync", 4))
|
||||
return 0;
|
||||
return isl_id_get_user(id) == kernel;
|
||||
}
|
||||
|
||||
/* Does "domain" consist of a single set with a tuple identifier
|
||||
* corresponding to a synchronization for "kernel"?
|
||||
*/
|
||||
static int domain_is_sync(__isl_keep isl_union_set *domain,
|
||||
struct ppcg_kernel *kernel)
|
||||
{
|
||||
int is_sync;
|
||||
isl_id *id;
|
||||
isl_set *set;
|
||||
|
||||
if (isl_union_set_n_set(domain) != 1)
|
||||
return 0;
|
||||
set = isl_set_from_union_set(isl_union_set_copy(domain));
|
||||
id = isl_set_get_tuple_id(set);
|
||||
is_sync = gpu_tree_id_is_sync(id, kernel);
|
||||
isl_id_free(id);
|
||||
isl_set_free(set);
|
||||
|
||||
return is_sync;
|
||||
}
|
||||
|
||||
/* Does "node" point to a filter selecting a synchronization statement
|
||||
* for "kernel"?
|
||||
*/
|
||||
static int node_is_sync_filter(__isl_keep isl_schedule_node *node,
|
||||
struct ppcg_kernel *kernel)
|
||||
{
|
||||
int is_sync;
|
||||
enum isl_schedule_node_type type;
|
||||
isl_union_set *domain;
|
||||
|
||||
if (!node)
|
||||
return -1;
|
||||
type = isl_schedule_node_get_type(node);
|
||||
if (type != isl_schedule_node_filter)
|
||||
return 0;
|
||||
domain = isl_schedule_node_filter_get_filter(node);
|
||||
is_sync = domain_is_sync(domain, kernel);
|
||||
isl_union_set_free(domain);
|
||||
|
||||
return is_sync;
|
||||
}
|
||||
|
||||
/* Is "node" part of a sequence with a previous synchronization statement
|
||||
* for "kernel"?
|
||||
* That is, is the parent of "node" a filter such that there is
|
||||
* a previous filter that picks out exactly such a synchronization statement?
|
||||
*/
|
||||
static int has_preceding_sync(__isl_keep isl_schedule_node *node,
|
||||
struct ppcg_kernel *kernel)
|
||||
{
|
||||
int found = 0;
|
||||
|
||||
node = isl_schedule_node_copy(node);
|
||||
node = isl_schedule_node_parent(node);
|
||||
while (!found && isl_schedule_node_has_previous_sibling(node)) {
|
||||
node = isl_schedule_node_previous_sibling(node);
|
||||
if (!node)
|
||||
break;
|
||||
found = node_is_sync_filter(node, kernel);
|
||||
}
|
||||
if (!node)
|
||||
found = -1;
|
||||
isl_schedule_node_free(node);
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
/* Is "node" part of a sequence with a subsequent synchronization statement
|
||||
* for "kernel"?
|
||||
* That is, is the parent of "node" a filter such that there is
|
||||
* a subsequent filter that picks out exactly such a synchronization statement?
|
||||
*/
|
||||
static int has_following_sync(__isl_keep isl_schedule_node *node,
|
||||
struct ppcg_kernel *kernel)
|
||||
{
|
||||
int found = 0;
|
||||
|
||||
node = isl_schedule_node_copy(node);
|
||||
node = isl_schedule_node_parent(node);
|
||||
while (!found && isl_schedule_node_has_next_sibling(node)) {
|
||||
node = isl_schedule_node_next_sibling(node);
|
||||
if (!node)
|
||||
break;
|
||||
found = node_is_sync_filter(node, kernel);
|
||||
}
|
||||
if (!node)
|
||||
found = -1;
|
||||
isl_schedule_node_free(node);
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
/* Does the subtree rooted at "node" (which is a band node) contain
|
||||
* any synchronization statement for "kernel" that precedes
|
||||
* the core computation of "kernel" (identified by the elements
|
||||
* in kernel->core)?
|
||||
*/
|
||||
static int has_sync_before_core(__isl_keep isl_schedule_node *node,
|
||||
struct ppcg_kernel *kernel)
|
||||
{
|
||||
int has_sync = 0;
|
||||
int is_thread;
|
||||
|
||||
node = isl_schedule_node_copy(node);
|
||||
while ((is_thread = node_is_thread(node)) == 0) {
|
||||
node = core_child(node, kernel->core);
|
||||
has_sync = has_preceding_sync(node, kernel);
|
||||
if (has_sync < 0 || has_sync)
|
||||
break;
|
||||
}
|
||||
if (is_thread < 0 || !node)
|
||||
has_sync = -1;
|
||||
isl_schedule_node_free(node);
|
||||
|
||||
return has_sync;
|
||||
}
|
||||
|
||||
/* Does the subtree rooted at "node" (which is a band node) contain
|
||||
* any synchronization statement for "kernel" that follows
|
||||
* the core computation of "kernel" (identified by the elements
|
||||
* in kernel->core)?
|
||||
*/
|
||||
static int has_sync_after_core(__isl_keep isl_schedule_node *node,
|
||||
struct ppcg_kernel *kernel)
|
||||
{
|
||||
int has_sync = 0;
|
||||
int is_thread;
|
||||
|
||||
node = isl_schedule_node_copy(node);
|
||||
while ((is_thread = node_is_thread(node)) == 0) {
|
||||
node = core_child(node, kernel->core);
|
||||
has_sync = has_following_sync(node, kernel);
|
||||
if (has_sync < 0 || has_sync)
|
||||
break;
|
||||
}
|
||||
if (is_thread < 0 || !node)
|
||||
has_sync = -1;
|
||||
isl_schedule_node_free(node);
|
||||
|
||||
return has_sync;
|
||||
}
|
||||
|
||||
/* Insert (or extend) an extension on top of "node" that puts
|
||||
* a synchronization node for "kernel" before "node".
|
||||
* Return a pointer to the original node in the updated schedule tree.
|
||||
*/
|
||||
static __isl_give isl_schedule_node *insert_sync_before(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
|
||||
{
|
||||
isl_union_set *domain;
|
||||
isl_schedule_node *graft;
|
||||
|
||||
if (!node)
|
||||
return NULL;
|
||||
|
||||
domain = create_sync_domain(kernel);
|
||||
graft = isl_schedule_node_from_domain(domain);
|
||||
node = isl_schedule_node_graft_before(node, graft);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Insert (or extend) an extension on top of "node" that puts
|
||||
* a synchronization node for "kernel" afater "node".
|
||||
* Return a pointer to the original node in the updated schedule tree.
|
||||
*/
|
||||
static __isl_give isl_schedule_node *insert_sync_after(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
|
||||
{
|
||||
isl_union_set *domain;
|
||||
isl_schedule_node *graft;
|
||||
|
||||
if (!node)
|
||||
return NULL;
|
||||
|
||||
domain = create_sync_domain(kernel);
|
||||
graft = isl_schedule_node_from_domain(domain);
|
||||
node = isl_schedule_node_graft_after(node, graft);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Insert an extension on top of "node" that puts a synchronization node
|
||||
* for "kernel" before "node" unless there already is
|
||||
* such a synchronization node.
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_ensure_preceding_sync(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
|
||||
{
|
||||
int has_sync;
|
||||
|
||||
has_sync = has_preceding_sync(node, kernel);
|
||||
if (has_sync < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
if (has_sync)
|
||||
return node;
|
||||
return insert_sync_before(node, kernel);
|
||||
}
|
||||
|
||||
/* Insert an extension on top of "node" that puts a synchronization node
|
||||
* for "kernel" after "node" unless there already is
|
||||
* such a synchronization node.
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_ensure_following_sync(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
|
||||
{
|
||||
int has_sync;
|
||||
|
||||
has_sync = has_following_sync(node, kernel);
|
||||
if (has_sync < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
if (has_sync)
|
||||
return node;
|
||||
return insert_sync_after(node, kernel);
|
||||
}
|
||||
|
||||
/* Insert an extension on top of "node" that puts a synchronization node
|
||||
* for "kernel" after "node" unless there already is such a sync node or
|
||||
* "node" itself already * contains a synchronization node following
|
||||
* the core computation of "kernel".
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
|
||||
{
|
||||
int has_sync;
|
||||
|
||||
has_sync = has_sync_after_core(node, kernel);
|
||||
if (has_sync < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
if (has_sync)
|
||||
return node;
|
||||
has_sync = has_following_sync(node, kernel);
|
||||
if (has_sync < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
if (has_sync)
|
||||
return node;
|
||||
return insert_sync_after(node, kernel);
|
||||
}
|
||||
|
||||
/* Move left in the sequence on top of "node" to a synchronization node
|
||||
* for "kernel".
|
||||
* If "node" itself contains a synchronization node preceding
|
||||
* the core computation of "kernel", then return "node" itself.
|
||||
* Otherwise, if "node" does not have a preceding synchronization node,
|
||||
* then create one first.
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_move_left_to_sync(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
|
||||
{
|
||||
int has_sync;
|
||||
int is_sync;
|
||||
|
||||
has_sync = has_sync_before_core(node, kernel);
|
||||
if (has_sync < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
if (has_sync)
|
||||
return node;
|
||||
node = gpu_tree_ensure_preceding_sync(node, kernel);
|
||||
node = isl_schedule_node_parent(node);
|
||||
while ((is_sync = node_is_sync_filter(node, kernel)) == 0)
|
||||
node = isl_schedule_node_previous_sibling(node);
|
||||
if (is_sync < 0)
|
||||
node = isl_schedule_node_free(node);
|
||||
node = isl_schedule_node_child(node, 0);
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
/* Move right in the sequence on top of "node" to a synchronization node
|
||||
* for "kernel".
|
||||
* If "node" itself contains a synchronization node following
|
||||
* the core computation of "kernel", then return "node" itself.
|
||||
* Otherwise, if "node" does not have a following synchronization node,
|
||||
* then create one first.
|
||||
*/
|
||||
__isl_give isl_schedule_node *gpu_tree_move_right_to_sync(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
|
||||
{
|
||||
int has_sync;
|
||||
int is_sync;
|
||||
|
||||
has_sync = has_sync_after_core(node, kernel);
|
||||
if (has_sync < 0)
|
||||
return isl_schedule_node_free(node);
|
||||
if (has_sync)
|
||||
return node;
|
||||
node = gpu_tree_ensure_following_sync(node, kernel);
|
||||
node = isl_schedule_node_parent(node);
|
||||
while ((is_sync = node_is_sync_filter(node, kernel)) == 0)
|
||||
node = isl_schedule_node_next_sibling(node);
|
||||
if (is_sync < 0)
|
||||
node = isl_schedule_node_free(node);
|
||||
node = isl_schedule_node_child(node, 0);
|
||||
|
||||
return node;
|
||||
}
|
||||
33
polly/lib/External/ppcg/gpu_tree.h
vendored
33
polly/lib/External/ppcg/gpu_tree.h
vendored
@ -1,33 +0,0 @@
|
||||
#ifndef GPU_TREE_H
|
||||
#define GPU_TREE_H
|
||||
|
||||
#include <isl/schedule_node.h>
|
||||
|
||||
#include "gpu.h"
|
||||
|
||||
__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
|
||||
__isl_take isl_schedule_node *node);
|
||||
int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node);
|
||||
__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
|
||||
__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
|
||||
__isl_give isl_schedule_node *gpu_tree_move_up_to_thread(
|
||||
__isl_take isl_schedule_node *node);
|
||||
__isl_give isl_schedule_node *gpu_tree_move_down_to_thread(
|
||||
__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
|
||||
__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel(
|
||||
__isl_take isl_schedule_node *node);
|
||||
__isl_give isl_schedule_node *gpu_tree_move_down_to_depth(
|
||||
__isl_take isl_schedule_node *node, int depth,
|
||||
__isl_keep isl_union_set *core);
|
||||
|
||||
int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel);
|
||||
__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
|
||||
__isl_give isl_schedule_node *gpu_tree_ensure_following_sync(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
|
||||
__isl_give isl_schedule_node *gpu_tree_move_left_to_sync(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
|
||||
__isl_give isl_schedule_node *gpu_tree_move_right_to_sync(
|
||||
__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
|
||||
|
||||
#endif
|
||||
684
polly/lib/External/ppcg/grouping.c
vendored
684
polly/lib/External/ppcg/grouping.c
vendored
@ -1,684 +0,0 @@
|
||||
/*
|
||||
* Copyright 2016 Sven Verdoolaege
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege.
|
||||
*/
|
||||
|
||||
#include <isl/ctx.h>
|
||||
#include <isl/id.h>
|
||||
#include <isl/val.h>
|
||||
#include <isl/space.h>
|
||||
#include <isl/aff.h>
|
||||
#include <isl/set.h>
|
||||
#include <isl/map.h>
|
||||
#include <isl/union_set.h>
|
||||
#include <isl/union_map.h>
|
||||
#include <isl/schedule.h>
|
||||
#include <isl/schedule_node.h>
|
||||
|
||||
#include "ppcg.h"
|
||||
|
||||
/* Internal data structure for use during the detection of statements
|
||||
* that can be grouped.
|
||||
*
|
||||
* "sc" contains the original schedule constraints (not a copy).
|
||||
* "dep" contains the intersection of the validity and the proximity
|
||||
* constraints in "sc". It may be NULL if it has not been computed yet.
|
||||
* "group_id" is the identifier for the next group that is extracted.
|
||||
*
|
||||
* "domain" is the set of statement instances that belong to any of the groups.
|
||||
* "contraction" maps the elements of "domain" to the corresponding group
|
||||
* instances.
|
||||
* "schedule" schedules the statements in each group relatively to each other.
|
||||
* These last three fields are NULL if no groups have been found so far.
|
||||
*/
|
||||
struct ppcg_grouping {
|
||||
isl_schedule_constraints *sc;
|
||||
|
||||
isl_union_map *dep;
|
||||
int group_id;
|
||||
|
||||
isl_union_set *domain;
|
||||
isl_union_pw_multi_aff *contraction;
|
||||
isl_schedule *schedule;
|
||||
};
|
||||
|
||||
/* Clear all memory allocated by "grouping".
|
||||
*/
|
||||
static void ppcg_grouping_clear(struct ppcg_grouping *grouping)
|
||||
{
|
||||
isl_union_map_free(grouping->dep);
|
||||
isl_union_set_free(grouping->domain);
|
||||
isl_union_pw_multi_aff_free(grouping->contraction);
|
||||
isl_schedule_free(grouping->schedule);
|
||||
}
|
||||
|
||||
/* Compute the intersection of the proximity and validity dependences
|
||||
* in grouping->sc and store the result in grouping->dep, unless
|
||||
* this intersection has been computed before.
|
||||
*/
|
||||
static isl_stat ppcg_grouping_compute_dep(struct ppcg_grouping *grouping)
|
||||
{
|
||||
isl_union_map *validity, *proximity;
|
||||
|
||||
if (grouping->dep)
|
||||
return isl_stat_ok;
|
||||
|
||||
validity = isl_schedule_constraints_get_validity(grouping->sc);
|
||||
proximity = isl_schedule_constraints_get_proximity(grouping->sc);
|
||||
grouping->dep = isl_union_map_intersect(validity, proximity);
|
||||
|
||||
if (!grouping->dep)
|
||||
return isl_stat_error;
|
||||
|
||||
return isl_stat_ok;
|
||||
}
|
||||
|
||||
/* Information extracted from one or more consecutive leaves
|
||||
* in the input schedule.
|
||||
*
|
||||
* "list" contains the sets of statement instances in the leaves,
|
||||
* one element in the list for each original leaf.
|
||||
* "domain" contains the union of the sets in "list".
|
||||
* "prefix" contains the prefix schedule of these elements.
|
||||
*/
|
||||
struct ppcg_grouping_leaf {
|
||||
isl_union_set *domain;
|
||||
isl_union_set_list *list;
|
||||
isl_multi_union_pw_aff *prefix;
|
||||
};
|
||||
|
||||
/* Free all memory allocated for "leaves".
|
||||
*/
|
||||
static void ppcg_grouping_leaf_free(int n, struct ppcg_grouping_leaf leaves[])
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!leaves)
|
||||
return;
|
||||
|
||||
for (i = 0; i < n; ++i) {
|
||||
isl_union_set_free(leaves[i].domain);
|
||||
isl_union_set_list_free(leaves[i].list);
|
||||
isl_multi_union_pw_aff_free(leaves[i].prefix);
|
||||
}
|
||||
|
||||
free(leaves);
|
||||
}
|
||||
|
||||
/* Short-hand for retrieving the prefix schedule at "node"
|
||||
* in the form of an isl_multi_union_pw_aff.
|
||||
*/
|
||||
static __isl_give isl_multi_union_pw_aff *get_prefix(
|
||||
__isl_keep isl_schedule_node *node)
|
||||
{
|
||||
return isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
|
||||
}
|
||||
|
||||
/* Return an array of "n" elements with information extracted from
|
||||
* the "n" children of "node" starting at "first", all of which
|
||||
* are known to be filtered leaves.
|
||||
*/
|
||||
struct ppcg_grouping_leaf *extract_leaves(__isl_keep isl_schedule_node *node,
|
||||
int first, int n)
|
||||
{
|
||||
int i;
|
||||
isl_ctx *ctx;
|
||||
struct ppcg_grouping_leaf *leaves;
|
||||
|
||||
if (!node)
|
||||
return NULL;
|
||||
|
||||
ctx = isl_schedule_node_get_ctx(node);
|
||||
leaves = isl_calloc_array(ctx, struct ppcg_grouping_leaf, n);
|
||||
if (!leaves)
|
||||
return NULL;
|
||||
|
||||
for (i = 0; i < n; ++i) {
|
||||
isl_schedule_node *child;
|
||||
isl_union_set *domain;
|
||||
|
||||
child = isl_schedule_node_get_child(node, first + i);
|
||||
child = isl_schedule_node_child(child, 0);
|
||||
domain = isl_schedule_node_get_domain(child);
|
||||
leaves[i].domain = isl_union_set_copy(domain);
|
||||
leaves[i].list = isl_union_set_list_from_union_set(domain);
|
||||
leaves[i].prefix = get_prefix(child);
|
||||
isl_schedule_node_free(child);
|
||||
}
|
||||
|
||||
return leaves;
|
||||
}
|
||||
|
||||
/* Internal data structure used by merge_leaves.
 */
struct ppcg_merge_leaves_data {
	/* Initially 0; set to 1 as soon as it turns out that
	 * it is useful to merge the two leaves.
	 */
	int merge;
	/* The first of the two consecutive leaves that are
	 * under investigation for being merged.
	 */
	struct ppcg_grouping_leaf *src;
	/* The second of these two consecutive leaves. */
	struct ppcg_grouping_leaf *dst;
};
|
||||
|
||||
/* Given a relation "map" between instances of two statements A and B,
|
||||
* does it relate every instance of A (according to the domain of "src")
|
||||
* to every instance of B (according to the domain of "dst")?
|
||||
*/
|
||||
static isl_bool covers_src_and_dst(__isl_keep isl_map *map,
|
||||
struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
|
||||
{
|
||||
isl_space *space;
|
||||
isl_set *set1, *set2;
|
||||
isl_bool is_subset;
|
||||
|
||||
space = isl_space_domain(isl_map_get_space(map));
|
||||
set1 = isl_union_set_extract_set(src->domain, space);
|
||||
set2 = isl_map_domain(isl_map_copy(map));
|
||||
is_subset = isl_set_is_subset(set1, set2);
|
||||
isl_set_free(set1);
|
||||
isl_set_free(set2);
|
||||
if (is_subset < 0 || !is_subset)
|
||||
return is_subset;
|
||||
|
||||
space = isl_space_range(isl_map_get_space(map));
|
||||
set1 = isl_union_set_extract_set(dst->domain, space);
|
||||
set2 = isl_map_range(isl_map_copy(map));
|
||||
is_subset = isl_set_is_subset(set1, set2);
|
||||
isl_set_free(set1);
|
||||
isl_set_free(set2);
|
||||
|
||||
return is_subset;
|
||||
}
|
||||
|
||||
/* Given a relation "map" between instances of two statements A and B,
|
||||
* are pairs of related instances executed together in the input schedule?
|
||||
* That is, is each pair of instances assigned the same value
|
||||
* by the corresponding prefix schedules?
|
||||
*
|
||||
* In particular, select the subset of "map" that has pairs of elements
|
||||
* with the same value for the prefix schedules and then check
|
||||
* if "map" is still a subset of the result.
|
||||
*/
|
||||
static isl_bool matches_prefix(__isl_keep isl_map *map,
|
||||
struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
|
||||
{
|
||||
isl_union_map *umap, *equal;
|
||||
isl_multi_union_pw_aff *src_prefix, *dst_prefix, *prefix;
|
||||
isl_bool is_subset;
|
||||
|
||||
src_prefix = isl_multi_union_pw_aff_copy(src->prefix);
|
||||
dst_prefix = isl_multi_union_pw_aff_copy(dst->prefix);
|
||||
prefix = isl_multi_union_pw_aff_union_add(src_prefix, dst_prefix);
|
||||
|
||||
umap = isl_union_map_from_map(isl_map_copy(map));
|
||||
equal = isl_union_map_copy(umap);
|
||||
equal = isl_union_map_eq_at_multi_union_pw_aff(equal, prefix);
|
||||
|
||||
is_subset = isl_union_map_is_subset(umap, equal);
|
||||
|
||||
isl_union_map_free(umap);
|
||||
isl_union_map_free(equal);
|
||||
|
||||
return is_subset;
|
||||
}
|
||||
|
||||
/* Given a set of validity and proximity schedule constraints "map"
|
||||
* between statements in consecutive leaves in a valid schedule,
|
||||
* should the two leaves be merged into one?
|
||||
*
|
||||
* In particular, the two are merged if the constraints form
|
||||
* a bijection between every instance of the first statement and
|
||||
* every instance of the second statement. Moreover, each
|
||||
* pair of such dependent instances needs to be executed consecutively
|
||||
* in the input schedule. That is, they need to be assigned
|
||||
* the same value by their prefix schedules.
|
||||
*
|
||||
* What this means is that for each instance of the first statement
|
||||
* there is exactly one instance of the second statement that
|
||||
* is executed immediately after the instance of the first statement and
|
||||
* that, moreover, both depends on this statement instance and
|
||||
* should be brought as close as possible to this statement instance.
|
||||
* In other words, it is both possible to execute the two instances
|
||||
* together (according to the input schedule) and desirable to do so
|
||||
* (according to the validity and proximity schedule constraints).
|
||||
*/
|
||||
static isl_stat check_merge(__isl_take isl_map *map, void *user)
|
||||
{
|
||||
struct ppcg_merge_leaves_data *data = user;
|
||||
isl_bool ok;
|
||||
|
||||
ok = covers_src_and_dst(map, data->src, data->dst);
|
||||
if (ok >= 0 && ok)
|
||||
ok = isl_map_is_bijective(map);
|
||||
if (ok >= 0 && ok)
|
||||
ok = matches_prefix(map, data->src, data->dst);
|
||||
|
||||
isl_map_free(map);
|
||||
|
||||
if (ok < 0)
|
||||
return isl_stat_error;
|
||||
if (!ok)
|
||||
return isl_stat_ok;
|
||||
|
||||
data->merge = 1;
|
||||
return isl_stat_error;
|
||||
}
|
||||
|
||||
/* Merge the leaves at position "pos" and "pos + 1" in "leaves".
|
||||
*/
|
||||
static isl_stat merge_pair(int n, struct ppcg_grouping_leaf leaves[], int pos)
|
||||
{
|
||||
int i;
|
||||
|
||||
leaves[pos].domain = isl_union_set_union(leaves[pos].domain,
|
||||
leaves[pos + 1].domain);
|
||||
leaves[pos].list = isl_union_set_list_concat(leaves[pos].list,
|
||||
leaves[pos + 1].list);
|
||||
leaves[pos].prefix = isl_multi_union_pw_aff_union_add(
|
||||
leaves[pos].prefix, leaves[pos + 1].prefix);
|
||||
for (i = pos + 1; i + 1 < n; ++i)
|
||||
leaves[i] = leaves[i + 1];
|
||||
leaves[n - 1].domain = NULL;
|
||||
leaves[n - 1].list = NULL;
|
||||
leaves[n - 1].prefix = NULL;
|
||||
|
||||
if (!leaves[pos].domain || !leaves[pos].list || !leaves[pos].prefix)
|
||||
return isl_stat_error;
|
||||
|
||||
return isl_stat_ok;
|
||||
}
|
||||
|
||||
/* Merge pairs of consecutive leaves in "leaves" taking into account
|
||||
* the intersection of validity and proximity schedule constraints "dep".
|
||||
*
|
||||
* If a leaf has been merged with the next leaf, then the combination
|
||||
* is checked again for merging with the next leaf.
|
||||
* That is, if the leaves are A, B and C, then B may not have been
|
||||
* merged with C, but after merging A and B, it could still be useful
|
||||
* to merge the combination AB with C.
|
||||
*
|
||||
* Two leaves A and B are merged if there are instances of at least
|
||||
* one pair of statements, one statement in A and one B, such that
|
||||
* the validity and proximity schedule constraints between them
|
||||
* make them suitable for merging according to check_merge.
|
||||
*
|
||||
* Return the final number of leaves in the sequence, or -1 on error.
|
||||
*/
|
||||
static int merge_leaves(int n, struct ppcg_grouping_leaf leaves[],
|
||||
__isl_keep isl_union_map *dep)
|
||||
{
|
||||
int i;
|
||||
struct ppcg_merge_leaves_data data;
|
||||
|
||||
for (i = n - 1; i >= 0; --i) {
|
||||
isl_union_map *dep_i;
|
||||
isl_stat ok;
|
||||
|
||||
if (i + 1 >= n)
|
||||
continue;
|
||||
|
||||
dep_i = isl_union_map_copy(dep);
|
||||
dep_i = isl_union_map_intersect_domain(dep_i,
|
||||
isl_union_set_copy(leaves[i].domain));
|
||||
dep_i = isl_union_map_intersect_range(dep_i,
|
||||
isl_union_set_copy(leaves[i + 1].domain));
|
||||
data.merge = 0;
|
||||
data.src = &leaves[i];
|
||||
data.dst = &leaves[i + 1];
|
||||
ok = isl_union_map_foreach_map(dep_i, &check_merge, &data);
|
||||
isl_union_map_free(dep_i);
|
||||
if (ok < 0 && !data.merge)
|
||||
return -1;
|
||||
if (!data.merge)
|
||||
continue;
|
||||
if (merge_pair(n, leaves, i) < 0)
|
||||
return -1;
|
||||
--n;
|
||||
++i;
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/* Construct a schedule with "domain" as domain, that executes
|
||||
* the elements of "list" in order (as a sequence).
|
||||
*/
|
||||
static __isl_give isl_schedule *schedule_from_domain_and_list(
|
||||
__isl_keep isl_union_set *domain, __isl_keep isl_union_set_list *list)
|
||||
{
|
||||
isl_schedule *schedule;
|
||||
isl_schedule_node *node;
|
||||
|
||||
schedule = isl_schedule_from_domain(isl_union_set_copy(domain));
|
||||
node = isl_schedule_get_root(schedule);
|
||||
isl_schedule_free(schedule);
|
||||
node = isl_schedule_node_child(node, 0);
|
||||
list = isl_union_set_list_copy(list);
|
||||
node = isl_schedule_node_insert_sequence(node, list);
|
||||
schedule = isl_schedule_node_get_schedule(node);
|
||||
isl_schedule_node_free(node);
|
||||
|
||||
return schedule;
|
||||
}
|
||||
|
||||
/* Construct a unique identifier for a group in "grouping".
|
||||
*
|
||||
* The name is of the form G_n, with n the first value starting at
|
||||
* grouping->group_id that does not result in an identifier
|
||||
* that is already in use in the domain of the original schedule
|
||||
* constraints.
|
||||
*/
|
||||
static isl_id *construct_group_id(struct ppcg_grouping *grouping,
|
||||
__isl_take isl_space *space)
|
||||
{
|
||||
isl_ctx *ctx;
|
||||
isl_id *id;
|
||||
isl_bool empty;
|
||||
isl_union_set *domain;
|
||||
|
||||
if (!space)
|
||||
return NULL;
|
||||
|
||||
ctx = isl_space_get_ctx(space);
|
||||
domain = isl_schedule_constraints_get_domain(grouping->sc);
|
||||
|
||||
do {
|
||||
char buffer[20];
|
||||
isl_id *id;
|
||||
isl_set *set;
|
||||
|
||||
snprintf(buffer, sizeof(buffer), "G_%d", grouping->group_id);
|
||||
grouping->group_id++;
|
||||
id = isl_id_alloc(ctx, buffer, NULL);
|
||||
space = isl_space_set_tuple_id(space, isl_dim_set, id);
|
||||
set = isl_union_set_extract_set(domain, isl_space_copy(space));
|
||||
empty = isl_set_plain_is_empty(set);
|
||||
isl_set_free(set);
|
||||
} while (empty >= 0 && !empty);
|
||||
|
||||
if (empty < 0)
|
||||
space = isl_space_free(space);
|
||||
|
||||
id = isl_space_get_tuple_id(space, isl_dim_set);
|
||||
|
||||
isl_space_free(space);
|
||||
isl_union_set_free(domain);
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
/* Construct a contraction from "prefix" and "domain" for a new group
|
||||
* in "grouping".
|
||||
*
|
||||
* The values of the prefix schedule "prefix" are used as instances
|
||||
* of the new group. The identifier of the group is constructed
|
||||
* in such a way that it does not conflict with those of earlier
|
||||
* groups nor with statements in the domain of the original
|
||||
* schedule constraints.
|
||||
* The isl_multi_union_pw_aff "prefix" then simply needs to be
|
||||
* converted to an isl_union_pw_multi_aff. However, this is not
|
||||
* possible if "prefix" is zero-dimensional, so in this case,
|
||||
* a contraction is constructed from "domain" instead.
|
||||
*/
|
||||
static isl_union_pw_multi_aff *group_contraction_from_prefix_and_domain(
|
||||
struct ppcg_grouping *grouping,
|
||||
__isl_keep isl_multi_union_pw_aff *prefix,
|
||||
__isl_keep isl_union_set *domain)
|
||||
{
|
||||
isl_id *id;
|
||||
isl_space *space;
|
||||
int dim;
|
||||
|
||||
space = isl_multi_union_pw_aff_get_space(prefix);
|
||||
if (!space)
|
||||
return NULL;
|
||||
dim = isl_space_dim(space, isl_dim_set);
|
||||
id = construct_group_id(grouping, space);
|
||||
if (dim == 0) {
|
||||
isl_multi_val *mv;
|
||||
|
||||
space = isl_multi_union_pw_aff_get_space(prefix);
|
||||
space = isl_space_set_tuple_id(space, isl_dim_set, id);
|
||||
mv = isl_multi_val_zero(space);
|
||||
domain = isl_union_set_copy(domain);
|
||||
return isl_union_pw_multi_aff_multi_val_on_domain(domain, mv);
|
||||
}
|
||||
prefix = isl_multi_union_pw_aff_copy(prefix);
|
||||
prefix = isl_multi_union_pw_aff_set_tuple_id(prefix, isl_dim_out, id);
|
||||
return isl_union_pw_multi_aff_from_multi_union_pw_aff(prefix);
|
||||
}
|
||||
|
||||
/* Extend "grouping" with groups corresponding to merged
|
||||
* leaves in the list of potentially merged leaves "leaves".
|
||||
*
|
||||
* The "list" field of each element in "leaves" contains a list
|
||||
* of the instances sets of the original leaves that have been
|
||||
* merged into this element. If at least two of the original leaves
|
||||
* have been merged into a given element, then add the corresponding
|
||||
* group to "grouping".
|
||||
* In particular, the domain is extended with the statement instances
|
||||
* of the merged leaves, the contraction is extended with a mapping
|
||||
* of these statement instances to instances of a new group and
|
||||
* the schedule is extended with a schedule that executes
|
||||
* the statement instances according to the order of the leaves
|
||||
* in which they appear.
|
||||
* Since the instances of the groups should already be scheduled apart
|
||||
* in the schedule into which this schedule will be plugged in,
|
||||
* the schedules of the individual groups are combined independently
|
||||
* of each other (as a set).
|
||||
*/
|
||||
static isl_stat add_groups(struct ppcg_grouping *grouping,
|
||||
int n, struct ppcg_grouping_leaf leaves[])
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < n; ++i) {
|
||||
int n_leaf;
|
||||
isl_schedule *schedule;
|
||||
isl_union_set *domain;
|
||||
isl_union_pw_multi_aff *upma;
|
||||
|
||||
n_leaf = isl_union_set_list_n_union_set(leaves[i].list);
|
||||
if (n_leaf < 0)
|
||||
return isl_stat_error;
|
||||
if (n_leaf <= 1)
|
||||
continue;
|
||||
schedule = schedule_from_domain_and_list(leaves[i].domain,
|
||||
leaves[i].list);
|
||||
upma = group_contraction_from_prefix_and_domain(grouping,
|
||||
leaves[i].prefix, leaves[i].domain);
|
||||
|
||||
domain = isl_union_set_copy(leaves[i].domain);
|
||||
if (grouping->domain) {
|
||||
domain = isl_union_set_union(domain, grouping->domain);
|
||||
upma = isl_union_pw_multi_aff_union_add(upma,
|
||||
grouping->contraction);
|
||||
schedule = isl_schedule_set(schedule,
|
||||
grouping->schedule);
|
||||
}
|
||||
grouping->domain = domain;
|
||||
grouping->contraction = upma;
|
||||
grouping->schedule = schedule;
|
||||
|
||||
if (!grouping->domain || !grouping->contraction ||
|
||||
!grouping->schedule)
|
||||
return isl_stat_error;
|
||||
}
|
||||
|
||||
return isl_stat_ok;
|
||||
}
|
||||
|
||||
/* Look for any pairs of consecutive leaves among the "n" children of "node"
|
||||
* starting at "first" that should be merged together.
|
||||
* Store the results in "grouping".
|
||||
*
|
||||
* First make sure the intersection of validity and proximity
|
||||
* schedule constraints is available and extract the required
|
||||
* information from the "n" leaves.
|
||||
* Then try and merge consecutive leaves based on the validity
|
||||
* and proximity constraints.
|
||||
* If any pairs were successfully merged, then add groups
|
||||
* corresponding to the merged leaves to "grouping".
|
||||
*/
|
||||
static isl_stat group_subsequence(__isl_keep isl_schedule_node *node,
|
||||
int first, int n, struct ppcg_grouping *grouping)
|
||||
{
|
||||
int n_merge;
|
||||
struct ppcg_grouping_leaf *leaves;
|
||||
|
||||
if (ppcg_grouping_compute_dep(grouping) < 0)
|
||||
return isl_stat_error;
|
||||
|
||||
leaves = extract_leaves(node, first, n);
|
||||
if (!leaves)
|
||||
return isl_stat_error;
|
||||
|
||||
n_merge = merge_leaves(n, leaves, grouping->dep);
|
||||
if (n_merge >= 0 && n_merge < n &&
|
||||
add_groups(grouping, n_merge, leaves) < 0)
|
||||
return isl_stat_error;
|
||||
|
||||
ppcg_grouping_leaf_free(n, leaves);
|
||||
|
||||
return isl_stat_ok;
|
||||
}
|
||||
|
||||
/* If "node" is a sequence, then check if it has any consecutive
|
||||
* leaves that should be merged together and store the results
|
||||
* in "grouping".
|
||||
*
|
||||
* In particular, call group_subsequence on each consecutive
|
||||
* sequence of (filtered) leaves among the children of "node".
|
||||
*/
|
||||
static isl_bool detect_groups(__isl_keep isl_schedule_node *node, void *user)
|
||||
{
|
||||
int i, n, first;
|
||||
struct ppcg_grouping *grouping = user;
|
||||
|
||||
if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
|
||||
return isl_bool_true;
|
||||
|
||||
n = isl_schedule_node_n_children(node);
|
||||
if (n < 0)
|
||||
return isl_bool_error;
|
||||
|
||||
first = -1;
|
||||
for (i = 0; i < n; ++i) {
|
||||
isl_schedule_node *child;
|
||||
enum isl_schedule_node_type type;
|
||||
|
||||
child = isl_schedule_node_get_child(node, i);
|
||||
child = isl_schedule_node_child(child, 0);
|
||||
type = isl_schedule_node_get_type(child);
|
||||
isl_schedule_node_free(child);
|
||||
|
||||
if (first >= 0 && type != isl_schedule_node_leaf) {
|
||||
if (group_subsequence(node, first, i - first,
|
||||
grouping) < 0)
|
||||
return isl_bool_error;
|
||||
first = -1;
|
||||
}
|
||||
if (first < 0 && type == isl_schedule_node_leaf)
|
||||
first = i;
|
||||
}
|
||||
if (first >= 0) {
|
||||
if (group_subsequence(node, first, n - first, grouping) < 0)
|
||||
return isl_bool_error;
|
||||
}
|
||||
|
||||
return isl_bool_true;
|
||||
}
|
||||
|
||||
/* Complete "grouping" to cover all statement instances in the domain
|
||||
* of grouping->sc.
|
||||
*
|
||||
* In particular, grouping->domain is set to the full set of statement
|
||||
* instances; group->contraction is extended with an identity
|
||||
* contraction on the additional instances and group->schedule
|
||||
* is extended with an independent schedule on those additional instances.
|
||||
* In the extension of group->contraction, the additional instances
|
||||
* are split into those belong to different statements and those
|
||||
* that belong to some of the same statements. The first group
|
||||
* is replaced by its universe in order to simplify the contraction extension.
|
||||
*/
|
||||
static void complete_grouping(struct ppcg_grouping *grouping)
|
||||
{
|
||||
isl_union_set *domain, *left, *overlap;
|
||||
isl_union_pw_multi_aff *upma;
|
||||
isl_schedule *schedule;
|
||||
|
||||
domain = isl_schedule_constraints_get_domain(grouping->sc);
|
||||
left = isl_union_set_subtract(isl_union_set_copy(domain),
|
||||
isl_union_set_copy(grouping->domain));
|
||||
schedule = isl_schedule_from_domain(isl_union_set_copy(left));
|
||||
schedule = isl_schedule_set(schedule, grouping->schedule);
|
||||
grouping->schedule = schedule;
|
||||
|
||||
overlap = isl_union_set_universe(grouping->domain);
|
||||
grouping->domain = domain;
|
||||
overlap = isl_union_set_intersect(isl_union_set_copy(left), overlap);
|
||||
left = isl_union_set_subtract(left, isl_union_set_copy(overlap));
|
||||
left = isl_union_set_universe(left);
|
||||
left = isl_union_set_union(left, overlap);
|
||||
upma = isl_union_set_identity_union_pw_multi_aff(left);
|
||||
upma = isl_union_pw_multi_aff_union_add(upma, grouping->contraction);
|
||||
grouping->contraction = upma;
|
||||
}
|
||||
|
||||
/* Compute a schedule on the domain of "sc" that respects the schedule
|
||||
* constraints in "sc".
|
||||
*
|
||||
* "schedule" is a known correct schedule that is used to combine
|
||||
* groups of statements if options->group_chains is set.
|
||||
* In particular, statements that are executed consecutively in a sequence
|
||||
* in this schedule and where all instances of the second depend on
|
||||
* the instance of the first that is executed in the same iteration
|
||||
* of outer band nodes are grouped together into a single statement.
|
||||
* The schedule constraints are then mapped to these groups of statements
|
||||
* and the resulting schedule is expanded again to refer to the original
|
||||
* statements.
|
||||
*/
|
||||
__isl_give isl_schedule *ppcg_compute_schedule(
|
||||
__isl_take isl_schedule_constraints *sc,
|
||||
__isl_keep isl_schedule *schedule, struct ppcg_options *options)
|
||||
{
|
||||
struct ppcg_grouping grouping = { sc };
|
||||
isl_union_pw_multi_aff *contraction;
|
||||
isl_union_map *umap;
|
||||
isl_schedule *res, *expansion;
|
||||
|
||||
if (!options->group_chains)
|
||||
return isl_schedule_constraints_compute_schedule(sc);
|
||||
|
||||
grouping.group_id = 0;
|
||||
if (isl_schedule_foreach_schedule_node_top_down(schedule,
|
||||
&detect_groups, &grouping) < 0)
|
||||
goto error;
|
||||
if (!grouping.contraction) {
|
||||
ppcg_grouping_clear(&grouping);
|
||||
return isl_schedule_constraints_compute_schedule(sc);
|
||||
}
|
||||
complete_grouping(&grouping);
|
||||
contraction = isl_union_pw_multi_aff_copy(grouping.contraction);
|
||||
umap = isl_union_map_from_union_pw_multi_aff(contraction);
|
||||
|
||||
sc = isl_schedule_constraints_apply(sc, umap);
|
||||
|
||||
res = isl_schedule_constraints_compute_schedule(sc);
|
||||
|
||||
contraction = isl_union_pw_multi_aff_copy(grouping.contraction);
|
||||
expansion = isl_schedule_copy(grouping.schedule);
|
||||
res = isl_schedule_expand(res, contraction, expansion);
|
||||
|
||||
ppcg_grouping_clear(&grouping);
|
||||
return res;
|
||||
error:
|
||||
ppcg_grouping_clear(&grouping);
|
||||
isl_schedule_constraints_free(sc);
|
||||
return NULL;
|
||||
}
|
||||
2242
polly/lib/External/ppcg/hybrid.c
vendored
2242
polly/lib/External/ppcg/hybrid.c
vendored
File diff suppressed because it is too large
Load Diff
41
polly/lib/External/ppcg/hybrid.h
vendored
41
polly/lib/External/ppcg/hybrid.h
vendored
@ -1,41 +0,0 @@
|
||||
#ifndef HYBRID_H
|
||||
#define HYBRID_H
|
||||
|
||||
#include <isl/val.h>
|
||||
#include <isl/schedule_node.h>
|
||||
|
||||
#include "ppcg.h"
|
||||
|
||||
struct ppcg_ht_bounds;
|
||||
typedef struct ppcg_ht_bounds ppcg_ht_bounds;
|
||||
|
||||
struct ppcg_ht_phase;
|
||||
typedef struct ppcg_ht_phase ppcg_ht_phase;
|
||||
|
||||
isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node);
|
||||
isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node);
|
||||
|
||||
__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop,
|
||||
__isl_keep isl_schedule_node *node);
|
||||
void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds);
|
||||
isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds);
|
||||
isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds,
|
||||
__isl_keep isl_multi_val *sizes);
|
||||
__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling(
|
||||
__isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes,
|
||||
__isl_take isl_schedule_node *node, struct ppcg_options *options);
|
||||
__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free(
|
||||
__isl_take ppcg_ht_bounds *bounds);
|
||||
|
||||
__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark(
|
||||
__isl_keep isl_schedule_node *node);
|
||||
__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point(
|
||||
__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node);
|
||||
__isl_give isl_schedule_node *hybrid_tile_foreach_phase(
|
||||
__isl_take isl_schedule_node *node,
|
||||
__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
|
||||
void *user), void *user);
|
||||
__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks(
|
||||
__isl_take isl_schedule_node *node);
|
||||
|
||||
#endif
|
||||
174
polly/lib/External/ppcg/ocl_utilities.c
vendored
174
polly/lib/External/ppcg/ocl_utilities.c
vendored
@ -1,174 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "ocl_utilities.h"
|
||||
|
||||
/* Return the OpenCL error string for a given error number.
|
||||
*/
|
||||
const char *opencl_error_string(cl_int error)
|
||||
{
|
||||
int errorCount;
|
||||
int index;
|
||||
|
||||
static const char *errorString[] = {
|
||||
[CL_SUCCESS] = "CL_SUCCESS",
|
||||
[-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND",
|
||||
[-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE",
|
||||
[-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE",
|
||||
[-CL_MEM_OBJECT_ALLOCATION_FAILURE] =
|
||||
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
|
||||
[-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES",
|
||||
[-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY",
|
||||
[-CL_PROFILING_INFO_NOT_AVAILABLE] =
|
||||
"CL_PROFILING_INFO_NOT_AVAILABLE",
|
||||
[-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP",
|
||||
[-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH",
|
||||
[-CL_IMAGE_FORMAT_NOT_SUPPORTED] =
|
||||
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
|
||||
[-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE",
|
||||
[-CL_MAP_FAILURE] = "CL_MAP_FAILURE",
|
||||
[-CL_INVALID_VALUE] = "CL_INVALID_VALUE",
|
||||
[-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE",
|
||||
[-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM",
|
||||
[-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE",
|
||||
[-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT",
|
||||
[-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES",
|
||||
[-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE",
|
||||
[-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR",
|
||||
[-CL_INVALID_MEM_OBJECT] = "CL_INVALID_MEM_OBJECT",
|
||||
[-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] =
|
||||
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
|
||||
[-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE",
|
||||
[-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER",
|
||||
[-CL_INVALID_BINARY] = "CL_INVALID_BINARY",
|
||||
[-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS",
|
||||
[-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM",
|
||||
[-CL_INVALID_PROGRAM_EXECUTABLE] =
|
||||
"CL_INVALID_PROGRAM_EXECUTABLE",
|
||||
[-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME",
|
||||
[-CL_INVALID_KERNEL_DEFINITION] =
|
||||
"CL_INVALID_KERNEL_DEFINITION",
|
||||
[-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL",
|
||||
[-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX",
|
||||
[-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE",
|
||||
[-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE",
|
||||
[-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS",
|
||||
[-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION",
|
||||
[-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE",
|
||||
[-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE",
|
||||
[-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET",
|
||||
[-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST",
|
||||
[-CL_INVALID_EVENT] = "CL_INVALID_EVENT",
|
||||
[-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION",
|
||||
[-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT",
|
||||
[-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE",
|
||||
[-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL",
|
||||
[-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE",
|
||||
[-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY"
|
||||
};
|
||||
|
||||
errorCount = sizeof(errorString) / sizeof(errorString[0]);
|
||||
index = -error;
|
||||
|
||||
return (index >= 0 && index < errorCount) ?
|
||||
errorString[index] : "Unspecified Error";
|
||||
}
|
||||
|
||||
/* Find a GPU or a CPU associated with the first available platform.
|
||||
* If use_gpu is set, then this function first tries to look for a GPU
|
||||
* in the first available platform.
|
||||
* If this fails or if use_gpu is not set, then it tries to use the CPU.
|
||||
*/
|
||||
cl_device_id opencl_create_device(int use_gpu)
|
||||
{
|
||||
cl_platform_id platform;
|
||||
cl_device_id dev;
|
||||
int err;
|
||||
|
||||
err = clGetPlatformIDs(1, &platform, NULL);
|
||||
if (err < 0) {
|
||||
fprintf(stderr, "Error %s while looking for a platform.\n",
|
||||
opencl_error_string(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
err = CL_DEVICE_NOT_FOUND;
|
||||
if (use_gpu)
|
||||
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev,
|
||||
NULL);
|
||||
if (err == CL_DEVICE_NOT_FOUND)
|
||||
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev,
|
||||
NULL);
|
||||
if (err < 0) {
|
||||
fprintf(stderr, "Error %s while looking for a device.\n",
|
||||
opencl_error_string(err));
|
||||
exit(1);
|
||||
}
|
||||
return dev;
|
||||
}
|
||||
|
||||
/* Create an OpenCL program from a string and compile it.
|
||||
*/
|
||||
cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
|
||||
const char *program_source, size_t program_size,
|
||||
const char *opencl_options)
|
||||
{
|
||||
int err;
|
||||
cl_program program;
|
||||
char *program_log;
|
||||
size_t log_size;
|
||||
|
||||
program = clCreateProgramWithSource(ctx, 1,
|
||||
&program_source, &program_size, &err);
|
||||
if (err < 0) {
|
||||
fprintf(stderr, "Could not create the program\n");
|
||||
exit(1);
|
||||
}
|
||||
err = clBuildProgram(program, 0, NULL, opencl_options, NULL, NULL);
|
||||
if (err < 0) {
|
||||
fprintf(stderr, "Could not build the program.\n");
|
||||
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0,
|
||||
NULL, &log_size);
|
||||
program_log = (char *) malloc(log_size + 1);
|
||||
program_log[log_size] = '\0';
|
||||
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
|
||||
log_size + 1, program_log, NULL);
|
||||
fprintf(stderr, "%s\n", program_log);
|
||||
free(program_log);
|
||||
exit(1);
|
||||
}
|
||||
return program;
|
||||
}
|
||||
|
||||
/* Create an OpenCL program from a source file and compile it.
|
||||
*/
|
||||
cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
|
||||
const char* filename, const char* opencl_options)
|
||||
{
|
||||
cl_program program;
|
||||
FILE *program_file;
|
||||
char *program_source;
|
||||
size_t program_size, read;
|
||||
|
||||
program_file = fopen(filename, "r");
|
||||
if (program_file == NULL) {
|
||||
fprintf(stderr, "Could not find the source file.\n");
|
||||
exit(1);
|
||||
}
|
||||
fseek(program_file, 0, SEEK_END);
|
||||
program_size = ftell(program_file);
|
||||
rewind(program_file);
|
||||
program_source = (char *) malloc(program_size + 1);
|
||||
program_source[program_size] = '\0';
|
||||
read = fread(program_source, sizeof(char), program_size, program_file);
|
||||
if (read != program_size) {
|
||||
fprintf(stderr, "Error while reading the kernel.\n");
|
||||
exit(1);
|
||||
}
|
||||
fclose(program_file);
|
||||
|
||||
program = opencl_build_program_from_string(ctx, dev, program_source,
|
||||
program_size, opencl_options);
|
||||
free(program_source);
|
||||
|
||||
return program;
|
||||
}
|
||||
32
polly/lib/External/ppcg/ocl_utilities.h
vendored
32
polly/lib/External/ppcg/ocl_utilities.h
vendored
@ -1,32 +0,0 @@
|
||||
#ifndef OCL_UTILITIES_H
|
||||
#define OCL_UTILITIES_H
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
/* Return the OpenCL error string for a given error number.
|
||||
*/
|
||||
const char *opencl_error_string(cl_int error);
|
||||
|
||||
/* Find a GPU or a CPU associated with the first available platform.
|
||||
* If use_gpu is set, then this function first tries to look for a GPU
|
||||
* in the first available platform.
|
||||
* If this fails or if use_gpu is not set, then it tries to use the CPU.
|
||||
*/
|
||||
cl_device_id opencl_create_device(int use_gpu);
|
||||
|
||||
/* Create an OpenCL program from a string and compile it.
|
||||
*/
|
||||
cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
|
||||
const char *program_source, size_t program_size,
|
||||
const char *opencl_options);
|
||||
|
||||
/* Create an OpenCL program from a source file and compile it.
|
||||
*/
|
||||
cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
|
||||
const char* filename, const char* opencl_options);
|
||||
|
||||
#endif
|
||||
11
polly/lib/External/ppcg/opencl.h
vendored
11
polly/lib/External/ppcg/opencl.h
vendored
@ -1,11 +0,0 @@
|
||||
#ifndef _OPENCL_H
|
||||
#define _OPENCL_H
|
||||
|
||||
#include <pet.h>
|
||||
#include "ppcg_options.h"
|
||||
#include "ppcg.h"
|
||||
|
||||
int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
|
||||
const char *input, const char *output);
|
||||
|
||||
#endif
|
||||
78
polly/lib/External/ppcg/opencl_test.sh.in
vendored
78
polly/lib/External/ppcg/opencl_test.sh.in
vendored
@ -1,78 +0,0 @@
|
||||
#!/bin/sh

# Run ppcg with the OpenCL target on the test cases and the examples,
# compile the generated code and execute it.  For the examples,
# the output is also compared against that of the original program.
#
# With --keep, the generated files are kept in "opencl_test.<version>";
# otherwise a temporary directory is used and removed on success.
#
# Path-valued variables are quoted throughout so that directories
# containing whitespace or glob characters are handled correctly.
# $CC, $CFLAGS and $ppcg_options are deliberately left unquoted because
# they may hold several words.

keep=no

for option; do
	case "$option" in
		--keep)
			keep=yes
			;;
	esac
done

EXEEXT=@EXEEXT@
VERSION=@GIT_HEAD_VERSION@
CC="@CC@"
CFLAGS="--std=gnu99"
srcdir="@srcdir@"

if [ "$keep" = "yes" ]; then
	OUTDIR="opencl_test.$VERSION"
	mkdir "$OUTDIR" || exit 1
else
	if test "x$TMPDIR" = "x"; then
		TMPDIR=/tmp
	fi
	OUTDIR=`mktemp -d "$TMPDIR/ppcg.XXXXXXXXXX"` || exit 1
fi

# run_tests <subdir> [<ppcg options>]
# Generate, compile and run every test case, placing the results
# in the <subdir> subdirectory of the output directory.
run_tests () {
	subdir=$1
	ppcg_options=$2

	echo Test with PPCG options \'$ppcg_options\'
	mkdir "${OUTDIR}/${subdir}" || exit 1
	for i in "$srcdir"/tests/*.c; do
		echo "$i"
		name=`basename "$i"`
		name="${name%.c}"
		out_c="${OUTDIR}/${subdir}/$name.ppcg.c"
		out="${OUTDIR}/${subdir}/$name.ppcg$EXEEXT"
		# Collect the ppcg arguments in the positional parameters
		# so that each argument survives as a single word.
		set -- --target=opencl --opencl-no-use-gpu $ppcg_options
		functions="$srcdir/tests/${name}_opencl_functions.cl"
		if test -f "$functions"; then
			set -- "$@" "--opencl-include-file=$functions"
			set -- "$@" "--opencl-compiler-options=-I."
		fi
		./ppcg$EXEEXT "$@" "$i" -o "$out_c" || exit
		$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
			-I. "$out_c" -o "$out" || exit
		"$out" || exit
	done
}

run_tests default
run_tests embed --opencl-embed-kernel-code

for i in "$srcdir"/examples/*.c; do
	echo "$i"
	name=`basename "$i"`
	name="${name%.c}"
	exe_ref="${OUTDIR}/$name.ref$EXEEXT"
	gen_ocl="${OUTDIR}/$name.ppcg.c"
	exe_ocl="${OUTDIR}/$name.ppcg$EXEEXT"
	output_ref="${OUTDIR}/$name.ref.out"
	output_ocl="${OUTDIR}/$name.ppcg.out"
	$CC $CFLAGS "$i" -o "$exe_ref" || exit
	./ppcg$EXEEXT --target=opencl --opencl-no-use-gpu "$i" -o "$gen_ocl" || \
		exit
	$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
		"$gen_ocl" -o "$exe_ocl" || exit
	"$exe_ref" > "$output_ref" || exit
	"$exe_ocl" > "$output_ocl" || exit
	cmp "$output_ref" "$output_ocl" || exit
done

if [ "$keep" = "no" ]; then
	rm -r "${OUTDIR}"
fi
|
||||
109
polly/lib/External/ppcg/polybench_test.sh.in
vendored
109
polly/lib/External/ppcg/polybench_test.sh.in
vendored
@ -1,109 +0,0 @@
|
||||
#!/bin/sh

# Defaults for the command-line switches handled below.
keep=no
verbose=no

# Recognize --keep and --verbose; any other argument is ignored.
for option; do
	case "$option" in
		--keep)
			keep=yes
			;;
		--verbose)
			verbose=yes
			;;
	esac
done

# The @...@ placeholders are expected to be substituted when this
# template is turned into the actual script.
EXEEXT=@EXEEXT@
DIR=@POLYBENCH_DIR@
VERSION=@GIT_HEAD_VERSION@
SIZE=-DMINI_DATASET
CC="@CC@"
HAVE_OPENCL=@HAVE_OPENCL@
HAVE_OPENMP=@HAVE_OPENMP@
srcdir="@srcdir@"

# With --keep the results go to a fresh "out.<version>" directory in the
# current directory; otherwise a temporary directory below $TMPDIR
# (default /tmp) is created.
if [ "$keep" = "yes" ]; then
	OUTDIR="out.$VERSION"
	mkdir "$OUTDIR" || exit 1
else
	if test "x$TMPDIR" = "x"; then
		TMPDIR=/tmp
	fi
	# Quoted so that a $TMPDIR containing whitespace is passed to
	# mktemp as a single template argument.
	OUTDIR=`mktemp -d "$TMPDIR/ppcg.XXXXXXXXXX"` || exit 1
fi

# Flags shared by the ppcg invocation and the compiler invocations.
CPPFLAGS="-DPOLYBENCH_USE_C99_PROTO -DPOLYBENCH_DUMP_ARRAYS"
CPPFLAGS="$CPPFLAGS $SIZE -I $DIR/utilities"
CFLAGS="-lm --std=gnu99"

echo "Running tests in folder ${OUTDIR}"
|
||||
|
||||
# Transform every benchmark listed in $DIR/utilities/benchmark_list with
# ppcg, build both the original and the transformed program, run them and
# require that what they write to standard error is identical.
#
# $1: tag used in the names of the generated files
# $2: options to pass to ppcg (may be empty; split on whitespace)
# $3: extra options for compiling the transformed program (may be empty;
#     split on whitespace)
#
# The script exits when ppcg, one of the compilations or the comparison
# fails.
run_tests () {
	ext=$1

	ppcg_options=$2
	cc_options=$3

	# Only used for the progress message below.
	ppcg_option_str=${ppcg_options:-none}
	cc_option_str=${cc_options:-none}

	echo Test: $ext, ppcg options: $ppcg_option_str, CC options: $cc_option_str
	for i in `cat $DIR/utilities/benchmark_list`; do
		echo $i
		name=`basename $i`
		name=${name%.c}
		source_opt="${OUTDIR}/$name.$ext.c"
		prog_orig="${OUTDIR}/$name.orig${EXEEXT}"
		prog_opt="${OUTDIR}/$name.$ext${EXEEXT}"
		output_orig="${OUTDIR}/$name.orig.out"
		output_opt="${OUTDIR}/$name.$ext.out"
		dir=`dirname $i`
		if [ "$verbose" = "yes" ]; then
			echo ./ppcg$EXEEXT -I $DIR/$dir $DIR/$i \
				$CPPFLAGS -o $source_opt $ppcg_options
		fi
		./ppcg$EXEEXT -I $DIR/$dir $DIR/$i $CPPFLAGS \
			-o "$source_opt" $ppcg_options || exit
		# Stop right away if the reference program cannot be built;
		# otherwise the failure would only surface later (or a binary
		# left over from an earlier run_tests call would be reused).
		$CC -I $DIR/$dir $CPPFLAGS $DIR/$i -o "$prog_orig" \
			$DIR/utilities/polybench.c $CFLAGS || exit
		"$prog_orig" 2> "$output_orig"
		if [ "$verbose" = "yes" ]; then
			echo $CC -I $DIR/$dir $CPPFLAGS $source_opt \
				-o $prog_opt $DIR/utilities/polybench.c \
				$CFLAGS $cc_options
		fi
		$CC -I $DIR/$dir $CPPFLAGS "$source_opt" -o "$prog_opt" \
			$DIR/utilities/polybench.c $CFLAGS $cc_options || exit

		"$prog_opt" 2> "$output_opt"
		cmp "$output_orig" "$output_opt" || exit
	done
}
|
||||
|
||||
# Plain C target with tiling, with and without live-range reordering.
run_tests ppcg "--target=c --tile"
run_tests ppcg_live "--target=c --no-live-range-reordering --tile"

# Test OpenMP code, if compiler supports openmp.
# The variables are quoted so that an empty substitution compares as
# "not yes" instead of producing a syntax error in "[".
if [ "$HAVE_OPENMP" = "yes" ]; then
	run_tests ppcg_omp "--target=c --openmp" -fopenmp
	echo Introduced `grep -R 'omp parallel' "${OUTDIR}" | wc -l` '"pragma omp parallel for"'
else
	echo Compiler does not support OpenMP. Skipping OpenMP tests.
fi

# Test OpenCL code, if HAVE_OPENCL is "yes".
if [ "$HAVE_OPENCL" = "yes" ]; then
	run_tests ppcg_opencl "--target=opencl --opencl-no-use-gpu" \
		"-I $srcdir $srcdir/ocl_utilities.c -lOpenCL"
fi

# Remove the output directory unless --keep was specified.
if [ "$keep" = "no" ]; then
	rm -r "${OUTDIR}"
fi
|
||||
1067
polly/lib/External/ppcg/ppcg.c
vendored
1067
polly/lib/External/ppcg/ppcg.c
vendored
File diff suppressed because it is too large
Load Diff
128
polly/lib/External/ppcg/ppcg.h
vendored
128
polly/lib/External/ppcg/ppcg.h
vendored
@ -1,128 +0,0 @@
|
||||
#ifndef PPCG_H
|
||||
#define PPCG_H
|
||||
|
||||
#include <isl/schedule.h>
|
||||
#include <isl/set.h>
|
||||
#include <isl/union_set.h>
|
||||
#include <isl/union_map.h>
|
||||
#include <isl/id_to_ast_expr.h>
|
||||
#include <pet.h>
|
||||
|
||||
#include "ppcg_options.h"
|
||||
|
||||
const char *ppcg_base_name(const char *filename);
|
||||
int ppcg_extract_base_name(char *name, const char *input);
|
||||
|
||||
/* Representation of the scop for use inside PPCG.
|
||||
*
|
||||
* "options" are the options specified by the user.
|
||||
* Some fields in this structure may depend on some of the options.
|
||||
*
|
||||
* "start" and "end" are file offsets of the corresponding program text.
|
||||
* "context" represents constraints on the parameters.
|
||||
* "domain" is the union of all iteration domains.
|
||||
* "call" contains the iteration domains of statements with a call expression.
|
||||
* "reads" contains all potential read accesses.
|
||||
* "tagged_reads" is the same as "reads", except that the domain is a wrapped
|
||||
* relation mapping an iteration domain to a reference identifier
|
||||
* "live_in" contains the potential read accesses that potentially
|
||||
* have no corresponding writes in the scop.
|
||||
* "may_writes" contains all potential write accesses.
|
||||
* "tagged_may_writes" is the same as "may_writes", except that the domain
|
||||
* is a wrapped relation mapping an iteration domain
|
||||
* to a reference identifier
|
||||
* "must_writes" contains all definite write accesses.
|
||||
* "tagged_must_writes" is the same as "must_writes", except that the domain
|
||||
* is a wrapped relation mapping an iteration domain
|
||||
* to a reference identifier
|
||||
* "live_out" contains the potential write accesses that are potentially
|
||||
* not killed by any kills or any other writes.
|
||||
* "must_kills" contains all definite kill accesses.
|
||||
* "tagged_must_kills" is the same as "must_kills", except that the domain
|
||||
* is a wrapped relation mapping an iteration domain
|
||||
* to a reference identifier.
|
||||
*
|
||||
* "tagger" maps tagged iteration domains to the corresponding untagged
|
||||
* iteration domain.
|
||||
*
|
||||
* "independence" is the union of all independence filters.
|
||||
*
|
||||
* "dep_flow" represents the potential flow dependences.
|
||||
* "tagged_dep_flow" is the same as "dep_flow", except that both domain and
|
||||
* range are wrapped relations mapping an iteration domain to
|
||||
* a reference identifier. May be NULL if not computed.
|
||||
* "dep_false" represents the potential false (anti and output) dependences.
|
||||
* "dep_forced" represents the validity constraints that should be enforced
|
||||
* even when live-range reordering is used.
|
||||
* In particular, these constraints ensure that all live-in
|
||||
* accesses remain live-in and that all live-out accesses remain live-out
|
||||
* and that multiple potential sources for the same read are
|
||||
* executed in the original order.
|
||||
* "dep_order"/"tagged_dep_order" represents the order dependences between
|
||||
* the live range intervals in "dep_flow"/"tagged_dep_flow".
|
||||
* It is only used if the live_range_reordering
|
||||
* option is set. Otherwise it is NULL.
|
||||
* If "dep_order" is used, then "dep_false" only contains a limited
|
||||
* set of anti and output dependences.
|
||||
* "schedule" represents the (original) schedule.
|
||||
*
|
||||
* "names" contains all variable names that are in use by the scop.
|
||||
* The names are mapped to a dummy value.
|
||||
*
|
||||
* "pet" is the original pet_scop.
|
||||
*/
|
||||
struct ppcg_scop {
|
||||
struct ppcg_options *options;
|
||||
|
||||
unsigned start;
|
||||
unsigned end;
|
||||
|
||||
isl_set *context;
|
||||
isl_union_set *domain;
|
||||
isl_union_set *call;
|
||||
isl_union_map *tagged_reads;
|
||||
isl_union_map *reads;
|
||||
isl_union_map *live_in;
|
||||
isl_union_map *tagged_may_writes;
|
||||
isl_union_map *may_writes;
|
||||
isl_union_map *tagged_must_writes;
|
||||
isl_union_map *must_writes;
|
||||
isl_union_map *live_out;
|
||||
isl_union_map *tagged_must_kills;
|
||||
isl_union_map *must_kills;
|
||||
|
||||
isl_union_pw_multi_aff *tagger;
|
||||
|
||||
isl_union_map *independence;
|
||||
|
||||
isl_union_map *dep_flow;
|
||||
isl_union_map *tagged_dep_flow;
|
||||
isl_union_map *dep_false;
|
||||
isl_union_map *dep_forced;
|
||||
isl_union_map *dep_order;
|
||||
isl_union_map *tagged_dep_order;
|
||||
isl_schedule *schedule;
|
||||
|
||||
isl_id_to_ast_expr *names;
|
||||
|
||||
struct pet_scop *pet;
|
||||
};
|
||||
|
||||
int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop);
|
||||
__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop,
|
||||
int n, const char *prefix);
|
||||
|
||||
int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out,
|
||||
struct ppcg_options *options,
|
||||
__isl_give isl_printer *(*fn)(__isl_take isl_printer *p,
|
||||
struct ppcg_scop *scop, void *user), void *user);
|
||||
|
||||
__isl_give isl_schedule *ppcg_compute_schedule(
|
||||
__isl_take isl_schedule_constraints *sc,
|
||||
__isl_keep isl_schedule *schedule, struct ppcg_options *options);
|
||||
|
||||
void compute_tagger(struct ppcg_scop *ps);
|
||||
void compute_dependences(struct ppcg_scop *scop);
|
||||
void eliminate_dead_code(struct ppcg_scop *ps);
|
||||
void *ppcg_scop_free(struct ppcg_scop *ps);
|
||||
#endif
|
||||
136
polly/lib/External/ppcg/ppcg_options.c
vendored
136
polly/lib/External/ppcg/ppcg_options.c
vendored
@ -1,136 +0,0 @@
|
||||
/*
|
||||
* Copyright 2010-2011 INRIA Saclay
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
|
||||
* Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
|
||||
* 91893 Orsay, France
|
||||
*/
|
||||
|
||||
#include "ppcg_options.h"
|
||||
|
||||
static struct isl_arg_choice target[] = {
|
||||
{"c", PPCG_TARGET_C},
|
||||
{"cuda", PPCG_TARGET_CUDA},
|
||||
{"opencl", PPCG_TARGET_OPENCL},
|
||||
{0}
|
||||
};
|
||||
|
||||
/* Set defaults that depend on the target.
|
||||
* In particular, set --schedule-outer-coincidence iff target is a GPU.
|
||||
*/
|
||||
void ppcg_options_set_target_defaults(struct ppcg_options *options)
|
||||
{
|
||||
char *argv[2] = { NULL };
|
||||
|
||||
argv[0] = "ppcg_options_set_target_defaults";
|
||||
if (options->target == PPCG_TARGET_C)
|
||||
argv[1] = "--no-schedule-outer-coincidence";
|
||||
else
|
||||
argv[1] = "--schedule-outer-coincidence";
|
||||
|
||||
isl_options_parse(options->isl, 2, argv, ISL_ARG_ALL);
|
||||
}
|
||||
|
||||
/* Callback for the "target" option, called after the target stored
 * in "opt" (a struct ppcg_options) has been updated to "val".
 *
 * "val" itself is not inspected; the target-dependent options are simply
 * recomputed by ppcg_options_set_target_defaults.
 * Always return 0.
 */
static int set_target(void *opt, unsigned val)
{
	ppcg_options_set_target_defaults((struct ppcg_options *) opt);
	return 0;
}
|
||||
|
||||
ISL_ARGS_START(struct ppcg_debug_options, ppcg_debug_options_args)
|
||||
ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule_constraints, 0,
|
||||
"dump-schedule-constraints", 0, "dump schedule constraints")
|
||||
ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule, 0,
|
||||
"dump-schedule", 0, "dump isl computed schedule")
|
||||
ISL_ARG_BOOL(struct ppcg_debug_options, dump_final_schedule, 0,
|
||||
"dump-final-schedule", 0, "dump PPCG computed schedule")
|
||||
ISL_ARG_BOOL(struct ppcg_debug_options, dump_sizes, 0,
|
||||
"dump-sizes", 0,
|
||||
"dump effectively used per kernel tile, grid and block sizes")
|
||||
ISL_ARG_BOOL(struct ppcg_debug_options, verbose, 'v', "verbose", 0, NULL)
|
||||
ISL_ARGS_END
|
||||
|
||||
ISL_ARGS_START(struct ppcg_options, ppcg_opencl_options_args)
|
||||
ISL_ARG_STR(struct ppcg_options, opencl_compiler_options, 0, "compiler-options",
|
||||
"options", NULL, "options to pass to the OpenCL compiler")
|
||||
ISL_ARG_BOOL(struct ppcg_options, opencl_use_gpu, 0, "use-gpu", 1,
|
||||
"use GPU device (if available)")
|
||||
ISL_ARG_STR_LIST(struct ppcg_options, opencl_n_include_file,
|
||||
opencl_include_files, 0, "include-file", "filename",
|
||||
"file to #include in generated OpenCL code")
|
||||
ISL_ARG_BOOL(struct ppcg_options, opencl_print_kernel_types, 0,
|
||||
"print-kernel-types", 1,
|
||||
"print definitions of types in the kernel file")
|
||||
ISL_ARG_BOOL(struct ppcg_options, opencl_embed_kernel_code, 0,
|
||||
"embed-kernel-code", 0, "embed kernel code into host code")
|
||||
ISL_ARGS_END
|
||||
|
||||
/* Description of the command line options that are stored
 * in a struct ppcg_options.
 * The isl options and the debugging options are handled by child
 * descriptions, while the OpenCL specific options are collected
 * in the "opencl" group.
 */
ISL_ARGS_START(struct ppcg_options, ppcg_options_args)
ISL_ARG_CHILD(struct ppcg_options, isl, "isl", &isl_options_args, "isl options")
ISL_ARG_CHILD(struct ppcg_options, debug, NULL, &ppcg_debug_options_args,
	"debugging options")
ISL_ARG_BOOL(struct ppcg_options, group_chains, 0, "group-chains", 1,
	"group chains of interdependent statements that are executed "
	"consecutively in the original schedule before scheduling")
ISL_ARG_BOOL(struct ppcg_options, reschedule, 0, "reschedule", 1,
	"replace original schedule by isl computed schedule")
ISL_ARG_BOOL(struct ppcg_options, scale_tile_loops, 0,
	"scale-tile-loops", 1, NULL)
ISL_ARG_BOOL(struct ppcg_options, wrap, 0, "wrap", 1, NULL)
ISL_ARG_BOOL(struct ppcg_options, use_shared_memory, 0, "shared-memory", 1,
	"use shared memory in kernel code")
ISL_ARG_BOOL(struct ppcg_options, use_private_memory, 0, "private-memory", 1,
	"use private memory in kernel code")
ISL_ARG_STR(struct ppcg_options, ctx, 0, "ctx", "context", NULL,
	"Constraints on parameters")
ISL_ARG_BOOL(struct ppcg_options, non_negative_parameters, 0,
	"assume-non-negative-parameters", 0,
	"assume all parameters are non-negative")
ISL_ARG_BOOL(struct ppcg_options, tile, 0, "tile", 0,
	"perform tiling (C target)")
ISL_ARG_INT(struct ppcg_options, tile_size, 'S', "tile-size", "size", 32, NULL)
ISL_ARG_BOOL(struct ppcg_options, isolate_full_tiles, 0, "isolate-full-tiles",
	0, "isolate full tiles from partial tiles (hybrid tiling)")
ISL_ARG_STR(struct ppcg_options, sizes, 0, "sizes", "sizes", NULL,
	"Per kernel tile, grid and block sizes")
ISL_ARG_INT(struct ppcg_options, max_shared_memory, 0,
	"max-shared-memory", "size", 8192, "maximal amount of shared memory")
ISL_ARG_BOOL(struct ppcg_options, openmp, 0, "openmp", 0,
	"Generate OpenMP macros (only for C target)")
ISL_ARG_USER_OPT_CHOICE(struct ppcg_options, target, 0, "target", target,
	&set_target, PPCG_TARGET_CUDA, PPCG_TARGET_CUDA,
	"the target to generate code for")
ISL_ARG_BOOL(struct ppcg_options, linearize_device_arrays, 0,
	"linearize-device-arrays", 1,
	"linearize all device arrays, even those of fixed size")
ISL_ARG_BOOL(struct ppcg_options, allow_gnu_extensions, 0,
	"allow-gnu-extensions", 1,
	"allow the use of GNU extensions in generated code")
ISL_ARG_BOOL(struct ppcg_options, live_range_reordering, 0,
	"live-range-reordering", 1,
	"allow successive live ranges on the same memory element "
	"to be reordered")
ISL_ARG_BOOL(struct ppcg_options, hybrid, 0, "hybrid", 0,
	"apply hybrid tiling whenever a suitable input pattern is found "
	"(GPU targets)")
ISL_ARG_BOOL(struct ppcg_options, unroll_copy_shared, 0, "unroll-copy-shared",
	0, "unroll code for copying to/from shared memory")
ISL_ARG_BOOL(struct ppcg_options, unroll_gpu_tile, 0, "unroll-gpu-tile", 0,
	"unroll code inside tile on GPU targets")
ISL_ARG_GROUP("opencl", &ppcg_opencl_options_args, "OpenCL options")
ISL_ARG_STR(struct ppcg_options, save_schedule_file, 0, "save-schedule",
	"file", NULL, "save isl computed schedule to <file>")
ISL_ARG_STR(struct ppcg_options, load_schedule_file, 0, "load-schedule",
	"file", NULL, "load schedule from <file>, "
	"using it instead of an isl computed schedule")
ISL_ARGS_END
|
||||
100
polly/lib/External/ppcg/ppcg_options.h
vendored
100
polly/lib/External/ppcg/ppcg_options.h
vendored
@ -1,100 +0,0 @@
|
||||
#ifndef PPCG_OPTIONS_H
|
||||
#define PPCG_OPTIONS_H
|
||||
|
||||
#include <isl/arg.h>
|
||||
#include <isl/options.h>
|
||||
|
||||
/* Debugging options of PPCG.
 * Each field holds the value of the like-named command line flag
 * (with '_' replaced by '-').
 */
struct ppcg_debug_options {
	/* Dump schedule constraints. */
	int dump_schedule_constraints;
	/* Dump isl computed schedule. */
	int dump_schedule;
	/* Dump PPCG computed schedule. */
	int dump_final_schedule;
	/* Dump effectively used per kernel tile, grid and block sizes. */
	int dump_sizes;
	/* Set by --verbose. */
	int verbose;
};
|
||||
|
||||
/* Options of PPCG, including the isl options and the debugging options
 * it builds on.
 */
struct ppcg_options {
	/* Options of isl. */
	struct isl_options *isl;
	/* Debugging options. */
	struct ppcg_debug_options *debug;

	/* Group chains of consecutive statements before scheduling. */
	int group_chains;

	/* Use isl to compute a schedule replacing the original schedule. */
	int reschedule;
	/* Set by --scale-tile-loops. */
	int scale_tile_loops;
	/* Set by --wrap. */
	int wrap;

	/* Assume all parameters are non-negative. */
	int non_negative_parameters;
	/* Constraints on the parameters (--ctx) or NULL. */
	char *ctx;
	/* Per kernel tile, grid and block sizes (--sizes) or NULL. */
	char *sizes;

	/* Perform tiling (C target). */
	int tile;
	/* Set by --tile-size. */
	int tile_size;

	/* Isolate full tiles from partial tiles. */
	int isolate_full_tiles;

	/* Take advantage of private memory. */
	int use_private_memory;

	/* Take advantage of shared memory. */
	int use_shared_memory;

	/* Maximal amount of shared memory. */
	int max_shared_memory;

	/* The target we generate code for (one of PPCG_TARGET_*). */
	int target;

	/* Generate OpenMP macros (C target only). */
	int openmp;

	/* Linearize all device arrays. */
	int linearize_device_arrays;

	/* Allow the use of GNU extensions in generated code. */
	int allow_gnu_extensions;

	/* Allow live range to be reordered. */
	int live_range_reordering;

	/* Allow hybrid tiling whenever a suitable input pattern is found. */
	int hybrid;

	/* Unroll the code for copying to/from shared memory. */
	int unroll_copy_shared;
	/* Unroll code inside tile on GPU targets. */
	int unroll_gpu_tile;

	/* Options to pass to the OpenCL compiler. */
	char *opencl_compiler_options;
	/* Prefer GPU device over CPU. */
	int opencl_use_gpu;
	/* Number of files to include. */
	int opencl_n_include_file;
	/* Files to include. */
	const char **opencl_include_files;
	/* Print definitions of types in kernels. */
	int opencl_print_kernel_types;
	/* Embed OpenCL kernel code in host code. */
	int opencl_embed_kernel_code;

	/* Name of file for saving isl computed schedule or NULL. */
	char *save_schedule_file;
	/* Name of file for loading schedule or NULL. */
	char *load_schedule_file;
};
|
||||
|
||||
ISL_ARG_DECL(ppcg_debug_options, struct ppcg_debug_options,
|
||||
ppcg_debug_options_args)
|
||||
ISL_ARG_DECL(ppcg_options, struct ppcg_options, ppcg_options_args)
|
||||
|
||||
#define PPCG_TARGET_C 0
|
||||
#define PPCG_TARGET_CUDA 1
|
||||
#define PPCG_TARGET_OPENCL 2
|
||||
|
||||
void ppcg_options_set_target_defaults(struct ppcg_options *options);
|
||||
|
||||
#endif
|
||||
461
polly/lib/External/ppcg/print.c
vendored
461
polly/lib/External/ppcg/print.c
vendored
@ -1,461 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012-2013 Ecole Normale Superieure
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege,
|
||||
* Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
|
||||
*/
|
||||
|
||||
#include <isl/aff.h>
|
||||
#include <isl/ast_build.h>
|
||||
#include <isl/id.h>
|
||||
|
||||
#include "print.h"
|
||||
#include "util.h"
|
||||
|
||||
/* Print a line consisting of an opening brace to "p" and
 * increase the indentation of "p" by 2.
 */
__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p)
{
	p = isl_printer_print_str(isl_printer_start_line(p), "{");
	return isl_printer_indent(isl_printer_end_line(p), 2);
}
|
||||
|
||||
/* Decrease the indentation of "p" by 2 and
 * print a line consisting of a closing brace to "p".
 */
__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p)
{
	p = isl_printer_start_line(isl_printer_indent(p, -2));
	return isl_printer_end_line(isl_printer_print_str(p, "}"));
}
|
||||
|
||||
/* Names of notes that keep track of whether min/max
|
||||
* macro definitions have already been printed.
|
||||
*/
|
||||
static const char *ppcg_max_printed = "ppcg_max_printed";
|
||||
static const char *ppcg_min_printed = "ppcg_min_printed";
|
||||
|
||||
/* Has the macro definition corresponding to "note_name" been printed
|
||||
* to "p" before?
|
||||
* That is, does "p" have an associated "note_name" note?
|
||||
*/
|
||||
static isl_bool printed_before(__isl_keep isl_printer *p, const char *note_name)
|
||||
{
|
||||
isl_ctx *ctx;
|
||||
isl_id *id;
|
||||
isl_bool printed;
|
||||
|
||||
if (!p)
|
||||
return isl_bool_error;
|
||||
|
||||
ctx = isl_printer_get_ctx(p);
|
||||
id = isl_id_alloc(ctx, note_name, NULL);
|
||||
printed = isl_printer_has_note(p, id);
|
||||
isl_id_free(id);
|
||||
|
||||
return printed;
|
||||
}
|
||||
|
||||
/* Keep track of the fact that the macro definition corresponding
|
||||
* to "note_name" has been printed to "p" by attaching a note with
|
||||
* that name. The value of the note is of no importance, but it
|
||||
* has to be a valid isl_id, so the note identifier is reused
|
||||
* as the note.
|
||||
*/
|
||||
static __isl_give isl_printer *mark_printed(__isl_take isl_printer *p,
|
||||
const char *note_name)
|
||||
{
|
||||
isl_ctx *ctx;
|
||||
isl_id *id;
|
||||
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
ctx = isl_printer_get_ctx(p);
|
||||
id = isl_id_alloc(ctx, note_name, NULL);
|
||||
return isl_printer_set_note(p, id, isl_id_copy(id));
|
||||
}
|
||||
|
||||
/* Print a macro definition "def" for the macro "name" to "p",
|
||||
* unless such a macro definition has been printed to "p" before.
|
||||
* "note_name" is used as the name of the note that keeps track
|
||||
* of whether this printing has happened.
|
||||
*/
|
||||
static __isl_give isl_printer *print_ppcg_macro(__isl_take isl_printer *p,
|
||||
const char *name, const char *def, const char *note_name)
|
||||
{
|
||||
isl_bool printed;
|
||||
|
||||
printed = printed_before(p, note_name);
|
||||
if (printed < 0)
|
||||
return isl_printer_free(p);
|
||||
if (printed)
|
||||
return p;
|
||||
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, "#define ");
|
||||
p = isl_printer_print_str(p, name);
|
||||
p = isl_printer_print_str(p, def);
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
p = mark_printed(p, note_name);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Structure for keeping track of definitions of some macros.
|
||||
*/
|
||||
struct ppcg_macros {
|
||||
const char *min;
|
||||
const char *max;
|
||||
};
|
||||
|
||||
/* Release the memory of the struct ppcg_macros that "user" points to.
 * The signature allows this function to be installed as the free_user
 * callback of an isl_id.
 */
static void ppcg_macros_free(void *user)
{
	free(user);
}
|
||||
|
||||
/* Default macro definitions (when GNU extensions are allowed).
|
||||
*/
|
||||
struct ppcg_macros ppcg_macros_default = {
|
||||
.min = "(x,y) "
|
||||
"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
|
||||
"_x < _y ? _x : _y; })",
|
||||
.max = "(x,y) "
|
||||
"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
|
||||
"_x > _y ? _x : _y; })",
|
||||
};
|
||||
|
||||
/* Name used for the note that keeps track of macro definitions.
|
||||
*/
|
||||
static const char *ppcg_macros = "ppcg_macros";
|
||||
|
||||
/* Set the macro definitions for isl_ast_op_min and isl_ast_op_max
|
||||
* to "min" and "max" and store them in "p".
|
||||
*
|
||||
* In particular, create a ppcg_macros object and attach it
|
||||
* as a note to the printer.
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
|
||||
const char *min, const char *max)
|
||||
{
|
||||
isl_ctx *ctx;
|
||||
isl_id *id, *macros_id;
|
||||
struct ppcg_macros *macros;
|
||||
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
ctx = isl_printer_get_ctx(p);
|
||||
macros = isl_alloc_type(ctx, struct ppcg_macros);
|
||||
if (!macros)
|
||||
return isl_printer_free(p);
|
||||
macros->min = min;
|
||||
macros->max = max;
|
||||
id = isl_id_alloc(ctx, ppcg_macros, NULL);
|
||||
macros_id = isl_id_alloc(ctx, NULL, macros);
|
||||
if (!macros_id)
|
||||
ppcg_macros_free(macros);
|
||||
else
|
||||
macros_id = isl_id_set_free_user(macros_id, &ppcg_macros_free);
|
||||
|
||||
p = isl_printer_set_note(p, id, macros_id);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Return the ppcg_macros object that holds the currently active
|
||||
* macro definitions in "p".
|
||||
* If "p" has a note with macro definitions, then return those.
|
||||
* Otherwise, return the default macro definitions.
|
||||
*/
|
||||
static struct ppcg_macros *get_macros(__isl_keep isl_printer *p)
|
||||
{
|
||||
isl_id *id;
|
||||
isl_bool has_macros;
|
||||
struct ppcg_macros *macros;
|
||||
|
||||
id = isl_id_alloc(isl_printer_get_ctx(p), ppcg_macros, NULL);
|
||||
has_macros = isl_printer_has_note(p, id);
|
||||
if (has_macros < 0 || !has_macros) {
|
||||
isl_id_free(id);
|
||||
if (has_macros < 0)
|
||||
return NULL;
|
||||
return &ppcg_macros_default;
|
||||
}
|
||||
id = isl_printer_get_note(p, id);
|
||||
macros = isl_id_get_user(id);
|
||||
isl_id_free(id);
|
||||
|
||||
return macros;
|
||||
}
|
||||
|
||||
/* Print the currently active macro definition for ppcg_max.
|
||||
*/
|
||||
static __isl_give isl_printer *print_max(__isl_take isl_printer *p)
|
||||
{
|
||||
struct ppcg_macros *macros;
|
||||
|
||||
macros = get_macros(p);
|
||||
if (!macros)
|
||||
return isl_printer_free(p);
|
||||
return print_ppcg_macro(p, ppcg_max, macros->max, ppcg_max_printed);
|
||||
}
|
||||
|
||||
/* Print the currently active macro definition for ppcg_min.
|
||||
*/
|
||||
static __isl_give isl_printer *print_min(__isl_take isl_printer *p)
|
||||
{
|
||||
struct ppcg_macros *macros;
|
||||
|
||||
macros = get_macros(p);
|
||||
if (!macros)
|
||||
return isl_printer_free(p);
|
||||
return print_ppcg_macro(p, ppcg_min, macros->min, ppcg_min_printed);
|
||||
}
|
||||
|
||||
/* Print a macro definition for "type" to "p".
|
||||
* If GNU extensions are allowed, then print a specialized definition
|
||||
* for isl_ast_op_min and isl_ast_op_max.
|
||||
* Otherwise, use the default isl definition.
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
|
||||
__isl_take isl_printer *p)
|
||||
{
|
||||
isl_ctx *ctx;
|
||||
struct ppcg_options *options;
|
||||
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
ctx = isl_printer_get_ctx(p);
|
||||
options = isl_ctx_peek_options(ctx, &ppcg_options_args);
|
||||
if (!options || !options->allow_gnu_extensions)
|
||||
return isl_ast_op_type_print_macro(type, p);
|
||||
|
||||
switch (type) {
|
||||
case isl_ast_op_max:
|
||||
return print_max(p);
|
||||
case isl_ast_op_min:
|
||||
return print_min(p);
|
||||
default:
|
||||
return isl_ast_op_type_print_macro(type, p);
|
||||
}
|
||||
}
|
||||
|
||||
/* isl_ast_expr_foreach_ast_op_type or isl_ast_node_foreach_ast_op_type
|
||||
* callback that prints a macro definition for "type".
|
||||
*/
|
||||
static isl_stat print_macro(enum isl_ast_op_type type, void *user)
|
||||
{
|
||||
isl_printer **p = user;
|
||||
|
||||
*p = ppcg_print_macro(type, *p);
|
||||
if (!*p)
|
||||
return isl_stat_error;
|
||||
|
||||
return isl_stat_ok;
|
||||
}
|
||||
|
||||
/* Print the required macros for "expr".
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_ast_expr_print_macros(
|
||||
__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p)
|
||||
{
|
||||
if (isl_ast_expr_foreach_ast_op_type(expr, &print_macro, &p) < 0)
|
||||
return isl_printer_free(p);
|
||||
return p;
|
||||
}
|
||||
|
||||
/* isl_id_to_ast_expr_foreach callback that prints the required
|
||||
* macro definitions for "val".
|
||||
*/
|
||||
static isl_stat print_expr_macros(__isl_take isl_id *key,
|
||||
__isl_take isl_ast_expr *val, void *user)
|
||||
{
|
||||
isl_printer **p = user;
|
||||
|
||||
*p = ppcg_ast_expr_print_macros(val, *p);
|
||||
isl_id_free(key);
|
||||
isl_ast_expr_free(val);
|
||||
|
||||
if (!*p)
|
||||
return isl_stat_error;
|
||||
return isl_stat_ok;
|
||||
}
|
||||
|
||||
/* Print the required macro definitions for the body of a statement in which
|
||||
* the access expressions are replaced by the isl_ast_expr objects
|
||||
* in "ref2expr".
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
|
||||
__isl_keep isl_id_to_ast_expr *ref2expr)
|
||||
{
|
||||
if (isl_id_to_ast_expr_foreach(ref2expr, &print_expr_macros, &p) < 0)
|
||||
return isl_printer_free(p);
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print the required macros for "node".
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
|
||||
__isl_keep isl_ast_node *node)
|
||||
{
|
||||
if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0)
|
||||
return isl_printer_free(p);
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Names used for the macros that may appear in a printed isl AST.
|
||||
*/
|
||||
const char *ppcg_min = "ppcg_min";
|
||||
const char *ppcg_max = "ppcg_max";
|
||||
const char *ppcg_fdiv_q = "ppcg_fdiv_q";
|
||||
|
||||
/* Set the names of the macros that may appear in a printed isl AST.
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p)
|
||||
{
|
||||
p = isl_ast_op_type_set_print_name(p, isl_ast_op_min, ppcg_min);
|
||||
p = isl_ast_op_type_set_print_name(p, isl_ast_op_max, ppcg_max);
|
||||
p = isl_ast_op_type_set_print_name(p, isl_ast_op_fdiv_q, ppcg_fdiv_q);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Given a multi affine expression "mpa" without domain, modify it to have
|
||||
* the schedule space of "build" as domain.
|
||||
*
|
||||
* If the schedule space of "build" is a parameter space, then nothing
|
||||
* needs to be done.
|
||||
* Otherwise, "mpa" is first given a 0D domain and then it is combined
|
||||
* with a mapping from the schedule space of "build" to the same 0D domain.
|
||||
*/
|
||||
__isl_give isl_multi_pw_aff *ppcg_attach_multi_pw_aff(
|
||||
__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_ast_build *build)
|
||||
{
|
||||
isl_bool params;
|
||||
isl_space *space;
|
||||
isl_multi_aff *ma;
|
||||
|
||||
space = isl_ast_build_get_schedule_space(build);
|
||||
params = isl_space_is_params(space);
|
||||
if (params < 0 || params) {
|
||||
isl_space_free(space);
|
||||
if (params < 0)
|
||||
return isl_multi_pw_aff_free(mpa);
|
||||
return mpa;
|
||||
}
|
||||
space = isl_space_from_domain(space);
|
||||
ma = isl_multi_aff_zero(space);
|
||||
mpa = isl_multi_pw_aff_from_range(mpa);
|
||||
mpa = isl_multi_pw_aff_pullback_multi_aff(mpa, ma);
|
||||
|
||||
return mpa;
|
||||
}
|
||||
|
||||
/* Build an access AST expression from "size" using "build".
|
||||
* "size" does not have a domain, but "build" may have a proper schedule space.
|
||||
* First modify "size" to have that schedule space as domain.
|
||||
*/
|
||||
__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
|
||||
__isl_keep isl_ast_build *build)
|
||||
{
|
||||
size = ppcg_attach_multi_pw_aff(size, build);
|
||||
return isl_ast_build_access_from_multi_pw_aff(build, size);
|
||||
}
|
||||
|
||||
/* Print a declaration for an array with element type "base_type" and
|
||||
* size "size" to "p".
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_print_declaration_with_size(
|
||||
__isl_take isl_printer *p, const char *base_type,
|
||||
__isl_keep isl_ast_expr *size)
|
||||
{
|
||||
if (!base_type || !size)
|
||||
return isl_printer_free(p);
|
||||
|
||||
p = ppcg_ast_expr_print_macros(size, p);
|
||||
p = isl_printer_start_line(p);
|
||||
p = isl_printer_print_str(p, base_type);
|
||||
p = isl_printer_print_str(p, " ");
|
||||
p = isl_printer_print_ast_expr(p, size);
|
||||
p = isl_printer_print_str(p, ";");
|
||||
p = isl_printer_end_line(p);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print a declaration for array "array" to "p", using "build"
|
||||
* to simplify any size expressions.
|
||||
*
|
||||
* The size is computed from the extent of the array and is
|
||||
* subsequently converted to an "access expression" by "build".
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
|
||||
struct pet_array *array, __isl_keep isl_ast_build *build)
|
||||
{
|
||||
isl_multi_pw_aff *size;
|
||||
isl_ast_expr *expr;
|
||||
|
||||
if (!array)
|
||||
return isl_printer_free(p);
|
||||
|
||||
size = ppcg_size_from_extent(isl_set_copy(array->extent));
|
||||
expr = isl_ast_build_access_from_multi_pw_aff(build, size);
|
||||
p = ppcg_print_declaration_with_size(p, array->element_type, expr);
|
||||
isl_ast_expr_free(expr);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print declarations for the arrays in "scop" that are declared
|
||||
* and that are exposed (if exposed == 1) or not exposed (if exposed == 0).
|
||||
*/
|
||||
static __isl_give isl_printer *print_declarations(__isl_take isl_printer *p,
|
||||
struct ppcg_scop *scop, int exposed)
|
||||
{
|
||||
int i;
|
||||
isl_ast_build *build;
|
||||
|
||||
if (!scop)
|
||||
return isl_printer_free(p);
|
||||
|
||||
build = isl_ast_build_from_context(isl_set_copy(scop->context));
|
||||
for (i = 0; i < scop->pet->n_array; ++i) {
|
||||
struct pet_array *array = scop->pet->arrays[i];
|
||||
|
||||
if (!array->declared)
|
||||
continue;
|
||||
if (array->exposed != exposed)
|
||||
continue;
|
||||
|
||||
p = ppcg_print_declaration(p, array, build);
|
||||
}
|
||||
isl_ast_build_free(build);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Print declarations for the arrays in "scop" that are declared
|
||||
* and exposed to the code after the scop.
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_print_exposed_declarations(
|
||||
__isl_take isl_printer *p, struct ppcg_scop *scop)
|
||||
{
|
||||
return print_declarations(p, scop, 1);
|
||||
}
|
||||
|
||||
/* Print declarations for the arrays in "scop" that are declared,
|
||||
* but not exposed to the code after the scop.
|
||||
*/
|
||||
__isl_give isl_printer *ppcg_print_hidden_declarations(
|
||||
__isl_take isl_printer *p, struct ppcg_scop *scop)
|
||||
{
|
||||
return print_declarations(p, scop, 0);
|
||||
}
|
||||
40
polly/lib/External/ppcg/print.h
vendored
40
polly/lib/External/ppcg/print.h
vendored
@ -1,40 +0,0 @@
|
||||
#ifndef PRINT_H
|
||||
#define PRINT_H
|
||||
|
||||
#include <isl/ast.h>
|
||||
|
||||
#include "ppcg.h"
|
||||
|
||||
extern const char *ppcg_min;
|
||||
extern const char *ppcg_max;
|
||||
extern const char *ppcg_fdiv_q;
|
||||
|
||||
__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p);
|
||||
__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p);
|
||||
|
||||
__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p);
|
||||
__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
|
||||
const char *min, const char *max);
|
||||
__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
|
||||
__isl_take isl_printer *p);
|
||||
__isl_give isl_printer *ppcg_ast_expr_print_macros(
|
||||
__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p);
|
||||
__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
|
||||
__isl_keep isl_id_to_ast_expr *ref2expr);
|
||||
__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
|
||||
__isl_keep isl_ast_node *node);
|
||||
|
||||
__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
|
||||
__isl_keep isl_ast_build *build);
|
||||
|
||||
__isl_give isl_printer *ppcg_print_declaration_with_size(
|
||||
__isl_take isl_printer *p, const char *base_type,
|
||||
__isl_keep isl_ast_expr *size);
|
||||
__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
|
||||
struct pet_array *array, __isl_keep isl_ast_build *build);
|
||||
__isl_give isl_printer *ppcg_print_exposed_declarations(
|
||||
__isl_take isl_printer *p, struct ppcg_scop *scop);
|
||||
__isl_give isl_printer *ppcg_print_hidden_declarations(
|
||||
__isl_take isl_printer *p, struct ppcg_scop *scop);
|
||||
|
||||
#endif
|
||||
165
polly/lib/External/ppcg/schedule.c
vendored
165
polly/lib/External/ppcg/schedule.c
vendored
@ -1,165 +0,0 @@
|
||||
/*
|
||||
* Copyright 2010-2011 INRIA Saclay
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
|
||||
* Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
|
||||
* 91893 Orsay, France
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <isl/set.h>
|
||||
#include <isl/map.h>
|
||||
#include <isl/constraint.h>
|
||||
|
||||
#include "schedule.h"
|
||||
|
||||
/* Add parameters with identifiers "ids" to "set".
|
||||
*/
|
||||
static __isl_give isl_set *add_params(__isl_take isl_set *set,
|
||||
__isl_keep isl_id_list *ids)
|
||||
{
|
||||
int i, n;
|
||||
unsigned nparam;
|
||||
|
||||
n = isl_id_list_n_id(ids);
|
||||
|
||||
nparam = isl_set_dim(set, isl_dim_param);
|
||||
set = isl_set_add_dims(set, isl_dim_param, n);
|
||||
|
||||
for (i = 0; i < n; ++i) {
|
||||
isl_id *id;
|
||||
|
||||
id = isl_id_list_get_id(ids, i);
|
||||
set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id);
|
||||
}
|
||||
|
||||
return set;
|
||||
}
|
||||
|
||||
/* Equate the dimensions of "set" starting at "first" to
|
||||
* freshly created parameters with identifiers "ids".
|
||||
* The number of equated dimensions is equal to the number of elements in "ids".
|
||||
*/
|
||||
static __isl_give isl_set *parametrize(__isl_take isl_set *set,
|
||||
int first, __isl_keep isl_id_list *ids)
|
||||
{
|
||||
int i, n;
|
||||
unsigned nparam;
|
||||
|
||||
nparam = isl_set_dim(set, isl_dim_param);
|
||||
|
||||
set = add_params(set, ids);
|
||||
|
||||
n = isl_id_list_n_id(ids);
|
||||
for (i = 0; i < n; ++i)
|
||||
set = isl_set_equate(set, isl_dim_param, nparam + i,
|
||||
isl_dim_set, first + i);
|
||||
|
||||
return set;
|
||||
}
|
||||
|
||||
/* Given a parameter space "space", create a set of dimension "len"
|
||||
* of which the dimensions starting at "first" are equated to
|
||||
* freshly created parameters with identifiers "ids".
|
||||
*/
|
||||
__isl_give isl_set *parametrization(__isl_take isl_space *space,
|
||||
int len, int first, __isl_keep isl_id_list *ids)
|
||||
{
|
||||
isl_set *set;
|
||||
|
||||
space = isl_space_set_from_params(space);
|
||||
space = isl_space_add_dims(space, isl_dim_set, len);
|
||||
set = isl_set_universe(space);
|
||||
|
||||
return parametrize(set, first, ids);
|
||||
}
|
||||
|
||||
/* Load and return a schedule from a file called "filename".
|
||||
*/
|
||||
static __isl_give isl_schedule *load_schedule(isl_ctx *ctx,
|
||||
const char *filename)
|
||||
{
|
||||
FILE *file;
|
||||
isl_schedule *schedule;
|
||||
|
||||
file = fopen(filename, "r");
|
||||
if (!file) {
|
||||
fprintf(stderr, "Unable to open '%s' for reading\n", filename);
|
||||
return NULL;
|
||||
}
|
||||
schedule = isl_schedule_read_from_file(ctx, file);
|
||||
fclose(file);
|
||||
|
||||
return schedule;
|
||||
}
|
||||
|
||||
/* Save the schedule "schedule" to a file called "filename".
|
||||
* The schedule is printed in block style.
|
||||
*/
|
||||
static void save_schedule(__isl_keep isl_schedule *schedule,
|
||||
const char *filename)
|
||||
{
|
||||
FILE *file;
|
||||
isl_ctx *ctx;
|
||||
isl_printer *p;
|
||||
|
||||
if (!schedule)
|
||||
return;
|
||||
|
||||
file = fopen(filename, "w");
|
||||
if (!file) {
|
||||
fprintf(stderr, "Unable to open '%s' for writing\n", filename);
|
||||
return;
|
||||
}
|
||||
ctx = isl_schedule_get_ctx(schedule);
|
||||
p = isl_printer_to_file(ctx, file);
|
||||
p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
|
||||
p = isl_printer_print_schedule(p, schedule);
|
||||
isl_printer_free(p);
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
/* Obtain a schedule, either by reading it form a file
|
||||
* or by computing it using "compute".
|
||||
* Also take care of saving the computed schedule and/or
|
||||
* dumping the obtained schedule if requested by the user.
|
||||
*/
|
||||
__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
|
||||
struct ppcg_options *options,
|
||||
__isl_give isl_schedule *(*compute)(void *user), void *user)
|
||||
{
|
||||
isl_schedule *schedule;
|
||||
|
||||
if (options->load_schedule_file) {
|
||||
schedule = load_schedule(ctx, options->load_schedule_file);
|
||||
} else {
|
||||
schedule = compute(user);
|
||||
if (options->save_schedule_file)
|
||||
save_schedule(schedule, options->save_schedule_file);
|
||||
}
|
||||
if (options->debug->dump_schedule)
|
||||
isl_schedule_dump(schedule);
|
||||
|
||||
return schedule;
|
||||
}
|
||||
|
||||
/* Mark all dimensions in the band node "node" to be of "type".
|
||||
*/
|
||||
__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
|
||||
__isl_take isl_schedule_node *node, enum isl_ast_loop_type type)
|
||||
{
|
||||
int i, n;
|
||||
|
||||
n = isl_schedule_node_band_n_member(node);
|
||||
for (i = 0; i < n; ++i)
|
||||
node = isl_schedule_node_band_member_set_ast_loop_type(node, i,
|
||||
type);
|
||||
|
||||
return node;
|
||||
}
|
||||
21
polly/lib/External/ppcg/schedule.h
vendored
21
polly/lib/External/ppcg/schedule.h
vendored
@ -1,21 +0,0 @@
|
||||
/* The guard avoids the "_SCHEDULE_H" spelling since identifiers that
 * start with an underscore followed by an uppercase letter are reserved.
 */
#ifndef PPCG_SCHEDULE_H
#define PPCG_SCHEDULE_H

#include <isl/id.h>
#include <isl/space.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>

#include "ppcg_options.h"

/* Create a set of dimension "len" in the parameter space "space"
 * of which the dimensions starting at "first" are equated to
 * freshly created parameters with identifiers "ids".
 */
__isl_give isl_set *parametrization(__isl_take isl_space *space,
	int len, int first, __isl_keep isl_id_list *ids);

/* Obtain a schedule, either by reading it from a file or
 * by calling "compute" on "user", depending on "options".
 */
__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
	struct ppcg_options *options,
	__isl_give isl_schedule *(*compute)(void *user), void *user);

/* Mark all members of the band node "node" to be of AST loop type "type". */
__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
	__isl_take isl_schedule_node *node, enum isl_ast_loop_type type);

#endif
|
||||
@ -1,49 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int A[2][1000][1000];
|
||||
int B[2][1000][1000];
|
||||
|
||||
#pragma scop
|
||||
{
|
||||
for (int i = 0; i < 256; ++i)
|
||||
for (int j = 0; j < 256; ++j)
|
||||
if (j % 8 <= 2 || j % 8 >= 6)
|
||||
A[1][i][j] = B[1][j][i];
|
||||
}
|
||||
#pragma endscop
|
||||
|
||||
/*
|
||||
|
||||
When compiled with:
|
||||
|
||||
./ppcg tests/allow-sparse-copy-in.c --no-linearize-device-arrays
|
||||
--on-error=abort --sizes='{kernel[i]->tile[8,8]; kernel[i]->block[1,8]}'
|
||||
--max-shared-memory=-1 --unroll-copy-shared
|
||||
|
||||
this originally resulted in the following copy-in code:
|
||||
|
||||
shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1];
|
||||
shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1];
|
||||
shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1];
|
||||
shared_B[0][3][t1] = B[1][8 * b1 + 3][8 * b0 + t1];
|
||||
shared_B[0][4][t1] = B[1][8 * b1 + 4][8 * b0 + t1];
|
||||
shared_B[0][5][t1] = B[1][8 * b1 + 5][8 * b0 + t1];
|
||||
shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1];
|
||||
shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1];
|
||||
|
||||
whereas we only want to perform copies that are actually needed:
|
||||
|
||||
shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1];
|
||||
shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1];
|
||||
shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1];
|
||||
shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1];
|
||||
shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1];
|
||||
*/
|
||||
for (int i = 0; i < 100; ++i)
|
||||
if (A[1][0][i] != i)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
29
polly/lib/External/ppcg/tests/call.c
vendored
29
polly/lib/External/ppcg/tests/call.c
vendored
@ -1,29 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
void copy_summary(int b[1000], int a[1000], int pos)
|
||||
{
|
||||
b[pos] = 0;
|
||||
int c = a[pos];
|
||||
}
|
||||
|
||||
#ifdef pencil_access
|
||||
__attribute__((pencil_access(copy_summary)))
|
||||
#endif
|
||||
void copy(int b[1000], int a[1000], int pos);
|
||||
|
||||
int main()
|
||||
{
|
||||
int a[1000], b[1000];
|
||||
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
a[i] = i;
|
||||
#pragma scop
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
copy(b, a, i);
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
if (b[i] != a[i])
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
29
polly/lib/External/ppcg/tests/call2.c
vendored
29
polly/lib/External/ppcg/tests/call2.c
vendored
@ -1,29 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
void copy_summary(int b[1000], int a[1000], int pos)
|
||||
{
|
||||
b[pos] = 0;
|
||||
int c = a[pos];
|
||||
}
|
||||
|
||||
#ifdef pencil_access
|
||||
__attribute__((pencil_access(copy_summary)))
|
||||
#endif
|
||||
void copy(int b[1000], int a[1000], int pos);
|
||||
|
||||
int main()
|
||||
{
|
||||
int a[2][1000];
|
||||
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
a[0][i] = i;
|
||||
#pragma scop
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
copy(a[1], a[0], i);
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
if (a[1][i] != a[0][i])
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
@ -1,4 +0,0 @@
|
||||
void copy(__global int b[1000], __global int a[1000], int pos)
|
||||
{
|
||||
b[pos] = a[pos];
|
||||
}
|
||||
32
polly/lib/External/ppcg/tests/call3.c
vendored
32
polly/lib/External/ppcg/tests/call3.c
vendored
@ -1,32 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
void copy_summary(int b[100], int a[100])
|
||||
{
|
||||
for (int i = 0; i < 100; ++i) {
|
||||
b[i] = 0;
|
||||
int c = a[i];
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef pencil_access
|
||||
__attribute__((pencil_access(copy_summary)))
|
||||
#endif
|
||||
void copy(int b[100], int a[100]);
|
||||
|
||||
int main()
|
||||
{
|
||||
int A[100][100], B[100];
|
||||
|
||||
for (int i = 0; i < 100; ++i)
|
||||
B[i] = i;
|
||||
#pragma scop
|
||||
for (int i = 0; i < 100; ++i)
|
||||
copy(A[i], B);
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 100; ++i)
|
||||
for (int j = 0; j < 100; ++j)
|
||||
if (A[j][i] != B[i])
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
@ -1,5 +0,0 @@
|
||||
void copy(__global int b[100], __global int a[100])
|
||||
{
|
||||
for (int i = 0; i < 100; ++i)
|
||||
b[i] = a[i];
|
||||
}
|
||||
@ -1,4 +0,0 @@
|
||||
void copy(__global int b[1000], __global int a[1000], int pos)
|
||||
{
|
||||
b[pos] = a[pos];
|
||||
}
|
||||
23
polly/lib/External/ppcg/tests/dead.c
vendored
23
polly/lib/External/ppcg/tests/dead.c
vendored
@ -1,23 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int a[1000], b[1000];
|
||||
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
a[i] = i;
|
||||
#pragma scop
|
||||
for (int i = 0; i < 1000; ++i) {
|
||||
int c;
|
||||
int d;
|
||||
c = a[i];
|
||||
d = c;
|
||||
b[i] = c;
|
||||
}
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
if (b[i] != a[i])
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
18
polly/lib/External/ppcg/tests/iterator.c
vendored
18
polly/lib/External/ppcg/tests/iterator.c
vendored
@ -1,18 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int i;
|
||||
int a[101];
|
||||
|
||||
i = 0;
|
||||
#pragma scop
|
||||
for (i = 0; i < 100; ++i)
|
||||
a[i] = i;
|
||||
a[i] = i;
|
||||
#pragma endscop
|
||||
if (a[100] != 100)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
22
polly/lib/External/ppcg/tests/live_out.c
vendored
22
polly/lib/External/ppcg/tests/live_out.c
vendored
@ -1,22 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
/* Check that a write access is not removed from the live-out
|
||||
* accesses only because a strict subset of the (potentially)
|
||||
* accessed elements are killed by a later write.
|
||||
*/
|
||||
int main()
|
||||
{
|
||||
int A[10];
|
||||
|
||||
A[1] = 0;
|
||||
#pragma scop
|
||||
int i = 1;
|
||||
i = i * i;
|
||||
A[i] = 1;
|
||||
A[0] = 0;
|
||||
#pragma endscop
|
||||
if (A[1] != 1)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
22
polly/lib/External/ppcg/tests/local.c
vendored
22
polly/lib/External/ppcg/tests/local.c
vendored
@ -1,22 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int A[100];
|
||||
|
||||
#pragma scop
|
||||
{
|
||||
int B[100];
|
||||
B[0] = 0;
|
||||
for (int i = 1; i < 100; ++i)
|
||||
B[i] = B[i - 1] + 1;
|
||||
for (int i = 0; i < 100; ++i)
|
||||
A[i] = B[i];
|
||||
}
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 100; ++i)
|
||||
if (A[i] != i)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
18
polly/lib/External/ppcg/tests/loop.c
vendored
18
polly/lib/External/ppcg/tests/loop.c
vendored
@ -1,18 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int a[1000], b[1000];
|
||||
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
a[i] = i;
|
||||
#pragma scop
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
b[i] = a[i];
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
if (b[i] != a[i])
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
29
polly/lib/External/ppcg/tests/not_accessed.c
vendored
29
polly/lib/External/ppcg/tests/not_accessed.c
vendored
@ -1,29 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
void copy_summary(int b[1000], int a[1000], int pos, int c[1000])
|
||||
{
|
||||
b[pos] = 0;
|
||||
int d = a[pos];
|
||||
}
|
||||
|
||||
#ifdef pencil_access
|
||||
__attribute__((pencil_access(copy_summary)))
|
||||
#endif
|
||||
void copy(int b[1000], int a[1000], int pos, int c[1000]);
|
||||
|
||||
int main()
|
||||
{
|
||||
int a[1000], b[1000], c[1000];
|
||||
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
a[i] = i;
|
||||
#pragma scop
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
copy(b, a, i, c);
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 1000; ++i)
|
||||
if (b[i] != a[i])
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
@ -1,5 +0,0 @@
|
||||
void copy(__global int b[1000], __global int a[1000], int pos,
|
||||
__global int c[1000])
|
||||
{
|
||||
b[pos] = a[pos];
|
||||
}
|
||||
13
polly/lib/External/ppcg/tests/scalar.c
vendored
13
polly/lib/External/ppcg/tests/scalar.c
vendored
@ -1,13 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
int a;
|
||||
#pragma scop
|
||||
a = 1;
|
||||
#pragma endscop
|
||||
if (a != 1)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
25
polly/lib/External/ppcg/tests/shared_sink.c
vendored
25
polly/lib/External/ppcg/tests/shared_sink.c
vendored
@ -1,25 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
/* Check that the sources of live ranges with the same sink
|
||||
* are executed in order.
|
||||
*/
|
||||
int main()
|
||||
{
|
||||
int A[128];
|
||||
int n = 128;
|
||||
|
||||
A[0] = 0;
|
||||
#pragma scop
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int set = 0;
|
||||
if (A[i] < 2)
|
||||
set = 1;
|
||||
if (set)
|
||||
A[i] = 2;
|
||||
}
|
||||
#pragma endscop
|
||||
if (A[0] != 2)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
31
polly/lib/External/ppcg/tests/struct.c
vendored
31
polly/lib/External/ppcg/tests/struct.c
vendored
@ -1,31 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
struct s {
|
||||
int c[10][10];
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
struct s a[10][10], b[10][10];
|
||||
|
||||
for (int i = 0; i < 10; ++i)
|
||||
for (int j = 0; j < 10; ++j)
|
||||
for (int k = 0; k < 10; ++k)
|
||||
for (int l = 0; l < 10; ++l)
|
||||
a[i][j].c[k][l] = i + j + k + l;
|
||||
#pragma scop
|
||||
for (int i = 0; i < 10; ++i)
|
||||
for (int j = 0; j < 10; ++j)
|
||||
for (int k = 0; k < 10; ++k)
|
||||
for (int l = 0; l < 10; ++l)
|
||||
b[i][j].c[k][l] = i + j + k + l;
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 10; ++i)
|
||||
for (int j = 0; j < 10; ++j)
|
||||
for (int k = 0; k < 10; ++k)
|
||||
for (int l = 0; l < 10; ++l)
|
||||
if (b[i][j].c[k][l] != a[i][j].c[k][l])
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
21
polly/lib/External/ppcg/tests/struct2.c
vendored
21
polly/lib/External/ppcg/tests/struct2.c
vendored
@ -1,21 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
struct s {
|
||||
int a;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
struct s a, b[10];
|
||||
|
||||
#pragma scop
|
||||
a.a = 42;
|
||||
for (int i = 0; i < 10; ++i)
|
||||
b[i].a = a.a;
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 10; ++i)
|
||||
if (b[i].a != 42)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
25
polly/lib/External/ppcg/tests/struct3.c
vendored
25
polly/lib/External/ppcg/tests/struct3.c
vendored
@ -1,25 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
struct s {
|
||||
int a;
|
||||
int b;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
struct s a, b[10];
|
||||
|
||||
a.b = 57;
|
||||
#pragma scop
|
||||
a.a = 42;
|
||||
for (int i = 0; i < 10; ++i)
|
||||
b[i] = a;
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 10; ++i)
|
||||
if (b[i].a != 42)
|
||||
return EXIT_FAILURE;
|
||||
if (a.b != 57)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
27
polly/lib/External/ppcg/tests/struct4.c
vendored
27
polly/lib/External/ppcg/tests/struct4.c
vendored
@ -1,27 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
struct s {
|
||||
int a;
|
||||
int b;
|
||||
};
|
||||
|
||||
int main()
|
||||
{
|
||||
int a[10];
|
||||
|
||||
for (int i = 0; i < 10; ++i)
|
||||
a[i] = 0;
|
||||
#pragma scop
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
struct s b;
|
||||
b.a = 1;
|
||||
b.b = i;
|
||||
a[i] = b.a + b.b;
|
||||
}
|
||||
#pragma endscop
|
||||
for (int i = 0; i < 10; ++i)
|
||||
if (a[i] != 1 + i)
|
||||
return EXIT_FAILURE;
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
105
polly/lib/External/ppcg/util.c
vendored
105
polly/lib/External/ppcg/util.c
vendored
@ -1,105 +0,0 @@
|
||||
/*
|
||||
* Copyright 2012-2013 Ecole Normale Superieure
|
||||
*
|
||||
* Use of this software is governed by the MIT license
|
||||
*
|
||||
* Written by Sven Verdoolaege,
|
||||
* Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
|
||||
*/
|
||||
|
||||
#include <isl/space.h>
|
||||
#include <isl/val.h>
|
||||
#include <isl/aff.h>
|
||||
#include <isl/set.h>
|
||||
|
||||
#include "util.h"
|
||||
|
||||
/* Construct an isl_multi_val living in "space" with all values equal to "val".
|
||||
*/
|
||||
__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
|
||||
int val)
|
||||
{
|
||||
int i, n;
|
||||
isl_ctx *ctx;
|
||||
isl_val *v;
|
||||
isl_multi_val *mv;
|
||||
|
||||
if (!space)
|
||||
return NULL;
|
||||
|
||||
ctx = isl_space_get_ctx(space);
|
||||
n = isl_space_dim(space, isl_dim_set);
|
||||
mv = isl_multi_val_zero(space);
|
||||
v = isl_val_int_from_si(ctx, val);
|
||||
for (i = 0; i < n; ++i)
|
||||
mv = isl_multi_val_set_val(mv, i, isl_val_copy(v));
|
||||
isl_val_free(v);
|
||||
|
||||
return mv;
|
||||
}
|
||||
|
||||
/* Construct an isl_multi_val living in "space" with values specified
|
||||
* by "list". "list" is assumed to have at least as many entries
|
||||
* as the set dimension of "space".
|
||||
*/
|
||||
__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
|
||||
__isl_take isl_space *space, int *list)
|
||||
{
|
||||
int i, n;
|
||||
isl_ctx *ctx;
|
||||
isl_multi_val *mv;
|
||||
|
||||
if (!space)
|
||||
return NULL;
|
||||
|
||||
ctx = isl_space_get_ctx(space);
|
||||
n = isl_space_dim(space, isl_dim_set);
|
||||
mv = isl_multi_val_zero(space);
|
||||
for (i = 0; i < n; ++i) {
|
||||
isl_val *v;
|
||||
|
||||
v = isl_val_int_from_si(ctx, list[i]);
|
||||
mv = isl_multi_val_set_val(mv, i, v);
|
||||
}
|
||||
|
||||
return mv;
|
||||
}
|
||||
|
||||
/* Compute the size of a bounding box around the origin and "set",
|
||||
* where "set" is assumed to contain only non-negative elements.
|
||||
* In particular, compute the maximal value of "set" in each direction
|
||||
* and add one.
|
||||
*/
|
||||
__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set)
|
||||
{
|
||||
int i, n;
|
||||
isl_multi_pw_aff *mpa;
|
||||
|
||||
n = isl_set_dim(set, isl_dim_set);
|
||||
mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
|
||||
for (i = 0; i < n; ++i) {
|
||||
isl_space *space;
|
||||
isl_aff *one;
|
||||
isl_pw_aff *bound;
|
||||
|
||||
if (!isl_set_dim_has_upper_bound(set, isl_dim_set, i)) {
|
||||
const char *name;
|
||||
name = isl_set_get_tuple_name(set);
|
||||
if (!name)
|
||||
name = "";
|
||||
fprintf(stderr, "unable to determine extent of '%s' "
|
||||
"in dimension %d\n", name, i);
|
||||
set = isl_set_free(set);
|
||||
}
|
||||
bound = isl_set_dim_max(isl_set_copy(set), i);
|
||||
|
||||
space = isl_pw_aff_get_domain_space(bound);
|
||||
one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
|
||||
one = isl_aff_add_constant_si(one, 1);
|
||||
bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
|
||||
mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
|
||||
}
|
||||
isl_set_free(set);
|
||||
|
||||
return mpa;
|
||||
}
|
||||
22
polly/lib/External/ppcg/util.h
vendored
22
polly/lib/External/ppcg/util.h
vendored
@ -1,22 +0,0 @@
|
||||
#ifndef UTIL_H
|
||||
#define UTIL_H
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <isl/space.h>
|
||||
#include <isl/val.h>
|
||||
|
||||
/* Compare the first strlen("prefix") characters of "s" with "prefix".
 * The result is that of strncmp, i.e., zero if "s" starts with "prefix".
 */
static inline int prefixcmp(const char *s, const char *prefix)
{
	size_t prefix_len = strlen(prefix);

	return strncmp(s, prefix, prefix_len);
}
|
||||
|
||||
/* Construct an isl_multi_val in "space" with all values equal to "val". */
__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
	int val);

/* Construct an isl_multi_val in "space" with values taken from "list". */
__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
	__isl_take isl_space *space, int *list);

/* Compute the size of a bounding box around the origin and "set". */
__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set);
|
||||
|
||||
#endif
|
||||
6
polly/lib/External/ppcg/version.c
vendored
6
polly/lib/External/ppcg/version.c
vendored
@ -1,6 +0,0 @@
|
||||
#include "gitversion.h"
|
||||
|
||||
const char *ppcg_version(void)
|
||||
{
|
||||
return GIT_HEAD_ID"\n";
|
||||
}
|
||||
@ -217,14 +217,6 @@ static StaticInitializer InitializeEverything;
|
||||
void initializePollyPasses(llvm::PassRegistry &Registry) {
|
||||
initializeCodeGenerationPass(Registry);
|
||||
|
||||
#ifdef GPU_CODEGEN
|
||||
initializePPCGCodeGenerationPass(Registry);
|
||||
initializeManagedMemoryRewritePassPass(Registry);
|
||||
LLVMInitializeNVPTXTarget();
|
||||
LLVMInitializeNVPTXTargetInfo();
|
||||
LLVMInitializeNVPTXTargetMC();
|
||||
LLVMInitializeNVPTXAsmPrinter();
|
||||
#endif
|
||||
initializeCodePreparationPass(Registry);
|
||||
initializeDeadCodeElimWrapperPassPass(Registry);
|
||||
initializeDependenceInfoPass(Registry);
|
||||
|
||||
@ -711,11 +711,6 @@ static void runIslScheduleOptimizer(
|
||||
function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
|
||||
TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
|
||||
isl::schedule &LastSchedule, bool &DepsChanged) {
|
||||
|
||||
// Skip SCoPs in case they're already optimised by PPCGCodeGeneration
|
||||
if (S.isToBeSkipped())
|
||||
return;
|
||||
|
||||
// Skip empty SCoPs but still allow code generation as it will delete the
|
||||
// loops present but not needed.
|
||||
if (S.getSize() == 0) {
|
||||
|
||||
@ -1,9 +0,0 @@
|
||||
define float @__nv_expf(float %a) {
|
||||
ret float %a
|
||||
}
|
||||
define float @__nv_cosf(float %a) {
|
||||
ret float %a
|
||||
}
|
||||
define float @__nv_logf(float %a) {
|
||||
ret float %a
|
||||
}
|
||||
@ -1,71 +0,0 @@
|
||||
; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
|
||||
; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; Check that we detect a scop.
|
||||
; SCOP: Function: checkScalarKill
|
||||
; SCOP-NEXT: Region: %XLoopInit---%for.end
|
||||
; SCOP-NEXT: Max Loop Depth: 1
|
||||
|
||||
; Check that we have a scalar that is not a phi node in the scop.
|
||||
; SCOP: i32 MemRef_x_0; // Element size 4
|
||||
|
||||
; Check that kernel launch is generated in host IR.
|
||||
; the declare would not be generated unless a call to a kernel exists.
|
||||
; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
|
||||
|
||||
; Check that we add variables that are local to a scop into the kills that we
|
||||
; pass to PPCG. This should enable PPCG to codegen this example.
|
||||
; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) {
|
||||
; int x;
|
||||
; #pragma scop
|
||||
; for(int i = 0; i < 1000; i++) {
|
||||
; XLoopInit: x = 0;
|
||||
;
|
||||
; if (control1 > 2)
|
||||
; C1Add: x += 10;
|
||||
; if (control2 > 3)
|
||||
; C2Add: x += A[i];
|
||||
;
|
||||
; BLoopAccumX: B[i] += x;
|
||||
; }
|
||||
;
|
||||
; #pragma endscop
|
||||
; }
|
||||
; ModuleID = 'test.ll'
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
define void @checkScalarKill(ptr %A, ptr %B, ptr %C, i32 %control1, i32 %control2) {
|
||||
entry:
|
||||
br label %entry.split
|
||||
|
||||
entry.split: ; preds = %entry
|
||||
br label %XLoopInit
|
||||
|
||||
XLoopInit: ; preds = %entry.split, %BLoopAccumX
|
||||
%indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ]
|
||||
%cmp1 = icmp sgt i32 %control1, 2
|
||||
%x.0 = select i1 %cmp1, i32 10, i32 0
|
||||
%cmp2 = icmp sgt i32 %control2, 3
|
||||
br i1 %cmp2, label %C2Add, label %BLoopAccumX
|
||||
|
||||
C2Add: ; preds = %XLoopInit
|
||||
%arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
|
||||
%tmp6 = load i32, ptr %arrayidx, align 4
|
||||
%add4 = add nsw i32 %tmp6, %x.0
|
||||
br label %BLoopAccumX
|
||||
|
||||
BLoopAccumX: ; preds = %XLoopInit, %C2Add
|
||||
%x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ]
|
||||
%arrayidx7 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
|
||||
%tmp11 = load i32, ptr %arrayidx7, align 4
|
||||
%add8 = add nsw i32 %tmp11, %x.1
|
||||
store i32 %add8, ptr %arrayidx7, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 1000
|
||||
br i1 %exitcond, label %XLoopInit, label %for.end
|
||||
|
||||
for.end: ; preds = %BLoopAccumX
|
||||
ret void
|
||||
}
|
||||
@ -1,53 +0,0 @@
|
||||
; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \
|
||||
; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \
|
||||
; RUN: FileCheck %s
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; CHECK: polly_launchKernel
|
||||
|
||||
; Verify that this program compiles. At some point, this compilation crashed
|
||||
; due to insufficient parameters being available.
|
||||
|
||||
source_filename = "bugpoint-output-4d01492.bc"
|
||||
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
%struct.barney = type { ptr, i64, i64, [2 x %struct.widget] }
|
||||
%struct.widget = type { i64, i64, i64 }
|
||||
|
||||
@global = external unnamed_addr global %struct.barney, align 32
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
; Test input with two nested loops. The outer loop (%bb1/%bb13) counts %tmp2
; upward from 1 and exits once it equals the i32 loaded from %arg. Its body
; (%bb3) stores undef through a base pointer loaded from @global, at an element
; index built from two further i64 fields of @global.
define void @wobble(ptr noalias %arg) #0 {
|
||||
bb:
|
||||
; Value the outer counter %tmp2 is compared against in %bb13.
%tmp = load i32, ptr %arg, align 4
|
||||
br label %bb1
|
||||
|
||||
bb1: ; preds = %bb13, %bb
|
||||
%tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ]
|
||||
br label %bb3
|
||||
|
||||
bb3: ; preds = %bb3, %bb1
|
||||
; Base pointer of the store, reloaded from field 0 of @global on every visit.
%tmp4 = load ptr, ptr @global, align 32
|
||||
%tmp5 = sext i32 %tmp2 to i64
|
||||
%tmp6 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 3, i64 1, i32 0), align 8
|
||||
; Element index = %tmp6 * %tmp2 + 0 + %tmp9.
%tmp7 = mul i64 %tmp6, %tmp5
|
||||
%tmp8 = add i64 %tmp7, 0
|
||||
%tmp9 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 1), align 8
|
||||
%tmp10 = add i64 %tmp8, %tmp9
|
||||
%tmp11 = getelementptr i32, ptr %tmp4, i64 %tmp10
|
||||
store i32 undef, ptr %tmp11, align 4
|
||||
; The comparison is constant-true, so control always leaves to %bb13 and the
; back edge to %bb3 is never taken.
%tmp12 = icmp eq i32 0, 0
|
||||
br i1 %tmp12, label %bb13, label %bb3
|
||||
|
||||
bb13: ; preds = %bb3
|
||||
%tmp14 = icmp eq i32 %tmp2, %tmp
|
||||
%tmp15 = add i32 %tmp2, 1
|
||||
br i1 %tmp14, label %bb16, label %bb1
|
||||
|
||||
bb16: ; preds = %bb13
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind uwtable }
|
||||
@ -1,50 +0,0 @@
|
||||
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
|
||||
; RUN: -polly-use-llvm-names < %s
|
||||
; ModuleID = 'test/GPGPU/zero-size-array.ll'
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
|
||||
; We used to divide the element size by 8 to arrive at the 'actual' size
|
||||
; of an array element. This used to cause arrays that have an element size
|
||||
; of less than 8 to collapse to size 0. This test makes sure that it does
|
||||
; not happen anymore.
|
||||
|
||||
; f(int *niters_ptr, int *arr[0]) {
|
||||
; const int niters = *niters_ptr;
|
||||
; for(int i = 0; i < niters; i++) {
|
||||
; arr[0][i + 1] = 0
|
||||
; }
|
||||
; }
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
; Test input. The first loop stores 0 into %arr, addressed as a [0 x i32]
; array, starting at index 1 and leaving after the iteration in which %indvar
; equals %niters. A second loop follows whose branch condition is undef.
define void @f(ptr noalias %niters.ptr, ptr noalias %arr) #0 {
|
||||
entry:
|
||||
%niters = load i32, ptr %niters.ptr, align 4
|
||||
br label %loop.body
|
||||
|
||||
loop.body: ; preds = %loop.body, %entry
|
||||
; The induction variable starts at 1, not 0.
%indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ]
|
||||
%indvar.sext = sext i32 %indvar to i64
|
||||
; Indexes into a zero-length array type; the second index selects the element.
%arr.slot = getelementptr [0 x i32], ptr %arr, i64 0, i64 %indvar.sext
|
||||
store i32 0, ptr %arr.slot, align 4
|
||||
; The exit test uses the value before the increment, so the store for
; %indvar == %niters still executes.
%tmp8 = icmp eq i32 %indvar, %niters
|
||||
%indvar.next = add i32 %indvar, 1
|
||||
br i1 %tmp8, label %loop.exit, label %loop.body
|
||||
|
||||
loop.exit: ; preds = %loop.body
|
||||
%tmp10 = icmp sgt i32 undef, 0
|
||||
br label %auxiliary.loop
|
||||
|
||||
auxiliary.loop: ; preds = %auxiliary.loop, %loop.exit
|
||||
%tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ]
|
||||
br i1 undef, label %auxiliary.loop, label %exit
|
||||
|
||||
exit: ; preds = %auxiliary.loop
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind uwtable }
|
||||
@ -1,55 +0,0 @@
|
||||
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
|
||||
; RUN: -polly-ignore-parameter-bounds \
|
||||
; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=HOST-IR
|
||||
;
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain
|
||||
; all the parameters present in the program.
|
||||
;
|
||||
; The construction of the `isl_multi_pw_aff` requires all the individual `pw_aff`
|
||||
; to have the same parameter dimensions. To achieve this, we used to realign
|
||||
; every `pw_aff` with `Scop::Context`. However, in conjunction with
|
||||
; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context`
|
||||
; does not contain all parameters.
|
||||
;
|
||||
; We check that Polly does the right thing in this case and sets up the parameter
|
||||
; dimensions correctly.
|
||||
|
||||
|
||||
; Check that kernel launch is generated in host IR.
|
||||
; the declare would not be generated unless a call to a kernel exists.
|
||||
; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
|
||||
; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll'
|
||||
|
||||
; C pseudocode
|
||||
; ------------
|
||||
; void f(int *arr, long niters, long stride) {
|
||||
; for(int i = 0; i < niters; i++) {
|
||||
; arr[i * stride] = 1;
|
||||
; }
|
||||
; }
|
||||
|
||||
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
; Test input. Stores 1 to the i32 element of %arr at index %indvar * %stride
; for %indvar = 0, 1, 2, ... and leaves the loop once %indvar + 1 is greater
; than %niters (signed comparison).
define void @f(ptr %arr, i64 %niters, i64 %stride) unnamed_addr #1 {
|
||||
entry:
|
||||
br label %loop
|
||||
|
||||
loop: ; preds = %loop, %entry
|
||||
%indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]
|
||||
; The element index depends on the parameter %stride.
%idx = mul nuw nsw i64 %indvar, %stride
|
||||
%slot = getelementptr i32, ptr %arr, i64 %idx
|
||||
store i32 1, ptr %slot, align 4
|
||||
%indvar.next = add nuw nsw i64 %indvar, 1
|
||||
; The test comes after the store, so the store runs at least once.
%check = icmp sgt i64 %indvar.next, %niters
|
||||
br i1 %check, label %exit, label %loop
|
||||
|
||||
exit: ; preds = %loop
|
||||
ret void
|
||||
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind uwtable }
|
||||
@ -1,37 +0,0 @@
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck -check-prefix=KERNEL %s
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) #0 {
|
||||
|
||||
; KERNEL: !nvvm.annotations = !{!0}
|
||||
|
||||
; KERNEL: !0 = !{ptr @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
; Test input. Adds 100 to each i64 element of %A at indices 0 .. %n - 1; the
; loop body is skipped entirely when %n is not greater than 0 (signed).
define void @foo(ptr %A, i64 %n) {
|
||||
bb:
|
||||
br label %bb1
|
||||
|
||||
bb1: ; preds = %bb6, %bb
|
||||
; Loop header, which tests %i.0 < %n before every iteration.
%i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
|
||||
%tmp = icmp slt i64 %i.0, %n
|
||||
br i1 %tmp, label %bb2, label %bb8
|
||||
|
||||
bb2: ; preds = %bb1
|
||||
; Read-modify-write of the same element address %tmp3.
%tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0
|
||||
%tmp4 = load i64, ptr %tmp3, align 8
|
||||
%tmp5 = add nsw i64 %tmp4, 100
|
||||
store i64 %tmp5, ptr %tmp3, align 8
|
||||
br label %bb6
|
||||
|
||||
bb6: ; preds = %bb2
|
||||
%tmp7 = add nuw nsw i64 %i.0, 1
|
||||
br label %bb1
|
||||
|
||||
bb8: ; preds = %bb1
|
||||
ret void
|
||||
}
|
||||
@ -1,118 +0,0 @@
|
||||
; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
|
||||
; RUN: FileCheck %s
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
;
|
||||
; #include <cuda_runtime.h>
|
||||
;
|
||||
; static const int N = 45;
|
||||
;
|
||||
; void copy(int *R, int *A) {
|
||||
; for (int i = 0; i < N; i++) {
|
||||
; R[i] = A[i] * 10;
|
||||
; }
|
||||
; }
|
||||
;
|
||||
; int main() {
|
||||
; int *A, *R;
|
||||
;
|
||||
; cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
|
||||
; cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
|
||||
;
|
||||
; for (int i = 0; i < N; i++) {
|
||||
; A[i] = i;
|
||||
; R[i] = 0;
|
||||
; }
|
||||
; copy(R, A);
|
||||
;
|
||||
; return 0;
|
||||
; }
|
||||
;
|
||||
|
||||
; CHECK-NOT: polly_copyFromHostToDevice
|
||||
; CHECK-NOT: polly_copyFromDeviceToHost
|
||||
; CHECK-NOT: polly_freeDeviceMemory
|
||||
; CHECK-NOT: polly_allocateMemoryForDevice
|
||||
|
||||
; CHECK: %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA()
|
||||
; CHECK-NEXT: %[[REGCA:[0-9]+]] = bitcast i32* %A to i8*
|
||||
; CHECK-NEXT: %[[REGCR:[0-9]+]] = bitcast i32* %R to i8*
|
||||
; CHECK-NEXT: %[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
|
||||
; CHECK-NEXT: store i8* %[[REGCA]], i8** %polly_launch_0_param_0
|
||||
; CHECK-NEXT: %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8*
|
||||
; CHECK-NEXT: store i8* %[[REGCP0]], i8** %[[REGGEP0]]
|
||||
; CHECK-NEXT: %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
|
||||
; CHECK-NEXT: store i8* %[[REGCR]], i8** %polly_launch_0_param_1
|
||||
; CHECK-NEXT: %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8*
|
||||
; CHECK-NEXT: store i8* %[[REGCP1]], i8** %[[REGGEP1]]
|
||||
; CHECK-NEXT: %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([863 x i8], [863 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
|
||||
; CHECK-NEXT: call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
|
||||
; CHECK-NEXT: call void @polly_freeKernel(i8* %[[REGKERNEL]])
|
||||
; CHECK-NEXT: call void @polly_synchronizeDevice()
|
||||
; CHECK-NEXT: call void @polly_freeContext(i8* %[[REGCTX]])
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
; Test input. For each index 0 .. 44, loads the i32 element of %A, multiplies
; it by 10 and stores the product to the element of %R at the same index.
define void @copy(i32* %R, i32* %A) {
|
||||
entry:
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
|
||||
; Constant trip count, the loop runs while the index differs from 45.
%exitcond = icmp ne i64 %indvars.iv, 45
|
||||
br i1 %exitcond, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
|
||||
%tmp = load i32, i32* %arrayidx, align 4
|
||||
%mul = mul nsw i32 %tmp, 10
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
|
||||
store i32 %mul, i32* %arrayidx2, align 4
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %for.body
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void
|
||||
}
|
||||
|
||||
; Driver for @copy. Obtains two buffers through @cudaMallocManaged, writes
; A[i] = i and R[i] = 0 for i = 0 .. 44, calls @copy(R, A) and returns 0.
define i32 @main() {
|
||||
entry:
|
||||
; Stack slots holding the two buffer pointers; @cudaMallocManaged receives
; their addresses cast to i8**.
%A = alloca i32*, align 8
|
||||
%R = alloca i32*, align 8
|
||||
%tmp = bitcast i32** %A to i8**
|
||||
; Requested size is 180 bytes, i.e. 45 elements of 4 bytes each.
%call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
|
||||
%tmp1 = bitcast i32** %R to i8**
|
||||
%call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
|
||||
%exitcond = icmp ne i64 %indvars.iv, 45
|
||||
br i1 %exitcond, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
; The buffer pointers are reloaded from their stack slots on every iteration.
%tmp2 = load i32*, i32** %A, align 8
|
||||
%arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
|
||||
%tmp3 = trunc i64 %indvars.iv to i32
|
||||
store i32 %tmp3, i32* %arrayidx, align 4
|
||||
%tmp4 = load i32*, i32** %R, align 8
|
||||
%arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
|
||||
store i32 0, i32* %arrayidx3, align 4
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %for.body
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
; @copy takes the destination first, so %R is loaded and passed before %A.
%tmp5 = load i32*, i32** %R, align 8
|
||||
%tmp6 = load i32*, i32** %A, align 8
|
||||
call void @copy(i32* %tmp5, i32* %tmp6)
|
||||
ret i32 0
|
||||
}
|
||||
|
||||
declare i32 @cudaMallocManaged(i8**, i64, i32) #1
|
||||
@ -1,104 +0,0 @@
|
||||
; RUN: opt %loadPolly %s -polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
|
||||
; RUN: | FileCheck --check-prefix=KERNEL-IR %s
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; KERNEL-IR: define ptx_kernel void @FUNC_vec_add_1_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arr, i32 %N) #0 {
|
||||
|
||||
; The instruction marked <<<LeakyInst>>> is copied into the GPUModule,
|
||||
; with changes only to the parameters to access data on the device instead of
|
||||
; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the
|
||||
; instruction is annotated with a DILocation, copying the instruction also copies
|
||||
; the metadata into the GPUModule. This stops codegenerating the ptx_kernel by
|
||||
; failing the verification of the Module in GPUNodeBuilder::finalize, due to the
|
||||
; copied DICompileUnit not being listed in a llvm.dbg.cu which was neither copied
|
||||
; nor created.
|
||||
;
|
||||
; https://reviews.llvm.org/D35630 removes this debug metadata before the
|
||||
; instruction is copied to the GPUModule.
|
||||
;
|
||||
; vec_add_1.c:
|
||||
; void vec_add_1(int N, int arr[N]) {
|
||||
; int i=0;
|
||||
; for( i=0 ; i<N ; i++) arr[i] += 1;
|
||||
; }
|
||||
;
|
||||
source_filename = "vec_add_1.c"
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
; Test input carrying debug info. Increments each i32 element of %arr at
; indices 0 .. %N - 1 by 1. Every instruction has a !dbg location attached, and
; the add tagged <<<LeakyInst>>> is the one the file header comment refers to.
define void @vec_add_1(i32 %N, ptr %arr) !dbg !7 {
|
||||
entry:
|
||||
call void @llvm.dbg.value(metadata i32 %N, i64 0, metadata !13, metadata !16), !dbg !17
|
||||
call void @llvm.dbg.value(metadata ptr %arr, i64 0, metadata !14, metadata !16), !dbg !18
|
||||
call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
|
||||
; Loop bound widened to i64 once, to match the i64 induction variable.
%tmp = sext i32 %N to i64, !dbg !20
|
||||
br label %for.cond, !dbg !20
|
||||
|
||||
for.cond: ; preds = %for.inc, %entry
|
||||
%indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
|
||||
call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !15, metadata !16), !dbg !19
|
||||
%cmp = icmp slt i64 %indvars.iv, %tmp, !dbg !22
|
||||
br i1 %cmp, label %for.body, label %for.end, !dbg !24
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
; Read-modify-write of the element at %arrayidx.
%arrayidx = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv, !dbg !25
|
||||
%tmp1 = load i32, ptr %arrayidx, align 4, !dbg !26, !tbaa !27
|
||||
%add = add nsw i32 %tmp1, 1, !dbg !26 ; <<<LeakyInst>>>
|
||||
store i32 %add, ptr %arrayidx, align 4, !dbg !26, !tbaa !27
|
||||
br label %for.inc, !dbg !25
|
||||
|
||||
for.inc: ; preds = %for.body
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31
|
||||
call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19
|
||||
br label %for.cond, !dbg !32, !llvm.loop !33
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
ret void, !dbg !35
|
||||
}
|
||||
|
||||
declare void @llvm.dbg.declare(metadata, metadata, metadata)
|
||||
|
||||
declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
|
||||
|
||||
|
||||
!llvm.dbg.cu = !{!0}
|
||||
!llvm.module.flags = !{!3, !4, !5}
|
||||
!llvm.ident = !{!6}
|
||||
|
||||
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
|
||||
!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp")
|
||||
!2 = !{}
|
||||
!3 = !{i32 2, !"Dwarf Version", i32 4}
|
||||
!4 = !{i32 2, !"Debug Info Version", i32 3}
|
||||
!5 = !{i32 1, !"wchar_size", i32 4}
|
||||
!6 = !{!"clang version 5.0.0"}
|
||||
!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!8 = !DISubroutineType(types: !9)
|
||||
!9 = !{null, !10, !11}
|
||||
!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
|
||||
!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
|
||||
!12 = !{!13, !14, !15}
|
||||
!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10)
|
||||
!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11)
|
||||
!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10)
|
||||
!16 = !DIExpression()
|
||||
!17 = !DILocation(line: 1, column: 20, scope: !7)
|
||||
!18 = !DILocation(line: 1, column: 27, scope: !7)
|
||||
!19 = !DILocation(line: 2, column: 7, scope: !7)
|
||||
!20 = !DILocation(line: 3, column: 8, scope: !21)
|
||||
!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3)
|
||||
!22 = !DILocation(line: 3, column: 15, scope: !23)
|
||||
!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3)
|
||||
!24 = !DILocation(line: 3, column: 3, scope: !21)
|
||||
!25 = !DILocation(line: 3, column: 25, scope: !23)
|
||||
!26 = !DILocation(line: 3, column: 32, scope: !23)
|
||||
!27 = !{!28, !28, i64 0}
|
||||
!28 = !{!"int", !29, i64 0}
|
||||
!29 = !{!"omnipotent char", !30, i64 0}
|
||||
!30 = !{!"Simple C/C++ TBAA"}
|
||||
!31 = !DILocation(line: 3, column: 21, scope: !23)
|
||||
!32 = !DILocation(line: 3, column: 3, scope: !23)
|
||||
!33 = distinct !{!33, !24, !34}
|
||||
!34 = !DILocation(line: 3, column: 35, scope: !21)
|
||||
!35 = !DILocation(line: 4, column: 1, scope: !7)
|
||||
@ -1,254 +0,0 @@
|
||||
; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck -check-prefix=SCHED %s
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck -check-prefix=CODE %s
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
|
||||
; RUN: FileCheck %s -check-prefix=IR
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck %s -check-prefix=KERNEL-IR
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck %s -check-prefix=KERNEL-ASM
|
||||
|
||||
; XFAIL: *
|
||||
|
||||
; REQUIRES: pollyacc, target=nvptx{{.*}}
|
||||
|
||||
; This fails today due to extensive output differences from when the test was written.
|
||||
|
||||
; CHECK: Stmt_bb5
|
||||
; CHECK-NEXT: Domain :=
|
||||
; CHECK-NEXT: { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 };
|
||||
; CHECK-NEXT: Schedule :=
|
||||
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> [i0, i1] };
|
||||
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
|
||||
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
|
||||
; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0]
|
||||
; CHECK-NEXT: { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
|
||||
|
||||
; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: context: "{ [] }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: sequence:
|
||||
; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: set:
|
||||
; SCHED-NEXT: - filter: "{ to_device_MemRef_A[] }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: guard: "{ [] }"
|
||||
; SCHED-NEXT: - filter: "{ Stmt_bb5[i0, i1] }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: guard: "{ [] }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: mark: "kernel"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]"
|
||||
; SCHED-NEXT: permutable: 1
|
||||
; SCHED-NEXT: coincident: [ 1, 1 ]
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]"
|
||||
; SCHED-NEXT: permutable: 1
|
||||
; SCHED-NEXT: coincident: [ 1, 1 ]
|
||||
; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: set:
|
||||
; SCHED-NEXT: - filter: "{ from_device_MemRef_A[] }"
|
||||
; SCHED-NEXT: child:
|
||||
; SCHED-NEXT: guard: "{ [] }"
|
||||
|
||||
; CODE: Code
|
||||
; CODE-NEXT: ====
|
||||
; CODE-NEXT: # host
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: dim3 k0_dimBlock(16, 32);
|
||||
; CODE-NEXT: dim3 k0_dimGrid(32, 32);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: # kernel0
|
||||
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
|
||||
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
|
||||
|
||||
; IR: polly.split_new_and_old:
|
||||
; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
|
||||
; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1
|
||||
; IR-NEXT: %polly.overflow.state = or i1 false, %.obit
|
||||
; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0
|
||||
; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
|
||||
; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1
|
||||
; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
|
||||
; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0
|
||||
; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
|
||||
; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1
|
||||
; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
|
||||
; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0
|
||||
; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
|
||||
; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1
|
||||
; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
|
||||
; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0
|
||||
; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440
|
||||
; IR-NEXT: %5 = and i1 true, %4
|
||||
; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
|
||||
; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown
|
||||
; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2
|
||||
|
||||
; IR: polly.start:
|
||||
; IR-NEXT: br label %polly.acc.initialize
|
||||
|
||||
; IR: polly.acc.initialize:
|
||||
; IR-NEXT: [[GPUContext:%.*]] = call ptr @polly_initContext()
|
||||
; IR-NEXT: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 4194304)
|
||||
; IR-NEXT: call void @polly_copyFromHostToDevice(ptr %A, ptr %p_dev_array_MemRef_A, i64 4194304)
|
||||
; IR-NEXT: [[DevPtr:%.*]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A)
|
||||
; IR-NEXT: store ptr [[DevPtr]], ptr %polly_launch_0_param_0
|
||||
; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params
|
||||
; IR-NEXT: call ptr @polly_getKernel
|
||||
; IR-NEXT: call void @polly_launchKernel(ptr %11, i32 32, i32 32, i32 32, i32 16, i32 1, ptr %polly_launch_0_params_i8ptr)
|
||||
; IR-NEXT: call void @polly_freeKernel
|
||||
; IR-NEXT: call void @polly_copyFromDeviceToHost(ptr %p_dev_array_MemRef_A, ptr %A, i64 4194304)
|
||||
; IR-NEXT: call void @polly_freeDeviceMemory(ptr %p_dev_array_MemRef_A)
|
||||
; IR-NEXT: call void @polly_freeContext(ptr [[GPUContext]])
|
||||
; IR-NEXT: br label %polly.exiting
|
||||
|
||||
; IR: polly.exiting:
|
||||
; IR-NEXT: br label %polly.merge_new_and_old
|
||||
|
||||
; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(ptr %MemRef_A) #0 {
|
||||
; KERNEL-IR-NEXT: entry:
|
||||
; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64
|
||||
; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
; KERNEL-IR-NEXT: %b1 = zext i32 %1 to i64
|
||||
; KERNEL-IR-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
; KERNEL-IR-NEXT: %t0 = zext i32 %2 to i64
|
||||
; KERNEL-IR-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
; KERNEL-IR-NEXT: %t1 = zext i32 %3 to i64
|
||||
; KERNEL-IR-NEXT: br label %polly.loop_preheader
|
||||
|
||||
; KERNEL-IR-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5
|
||||
; KERNEL-IR-NEXT: ret void
|
||||
|
||||
; KERNEL-IR-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader
|
||||
; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
|
||||
; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0
|
||||
; KERNEL-IR-NEXT: %6 = mul nsw i64 32, %b1
|
||||
; KERNEL-IR-NEXT: %7 = add nsw i64 %6, %t1
|
||||
; KERNEL-IR-NEXT: %8 = mul nsw i64 16, %polly.indvar
|
||||
; KERNEL-IR-NEXT: %9 = add nsw i64 %7, %8
|
||||
; KERNEL-IR-NEXT: br label %polly.stmt.bb5
|
||||
|
||||
; KERNEL-IR-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header
|
||||
; KERNEL-IR-NEXT: %10 = mul i64 %5, %9
|
||||
; KERNEL-IR-NEXT: %p_tmp6 = sitofp i64 %10 to float
|
||||
; KERNEL-IR-NEXT: %11 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %12 = add nsw i64 %11, %t0
|
||||
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024
|
||||
; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b1
|
||||
; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t1
|
||||
; KERNEL-IR-NEXT: %15 = mul nsw i64 16, %polly.indvar
|
||||
; KERNEL-IR-NEXT: %16 = add nsw i64 %14, %15
|
||||
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16
|
||||
; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A
|
||||
; KERNEL-IR-NEXT: %tmp8_p_scalar_ = load float, ptr %polly.access.MemRef_A, align 4
|
||||
; KERNEL-IR-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
|
||||
; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0
|
||||
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024
|
||||
; KERNEL-IR-NEXT: %19 = mul nsw i64 32, %b1
|
||||
; KERNEL-IR-NEXT: %20 = add nsw i64 %19, %t1
|
||||
; KERNEL-IR-NEXT: %21 = mul nsw i64 16, %polly.indvar
|
||||
; KERNEL-IR-NEXT: %22 = add nsw i64 %20, %21
|
||||
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22
|
||||
; KERNEL-IR-NEXT: %polly.access.MemRef_A4 = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A3
|
||||
; KERNEL-IR-NEXT: store float %p_tmp9, ptr %polly.access.MemRef_A4, align 4
|
||||
; KERNEL-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
|
||||
; KERNEL-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 0
|
||||
; KERNEL-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
|
||||
|
||||
; KERNEL-IR-LABEL: polly.loop_preheader: ; preds = %entry
|
||||
; KERNEL-IR-NEXT: br label %polly.loop_header
|
||||
|
||||
; KERNEL-IR: attributes #0 = { "polly.skip.fn" }
|
||||
|
||||
; KERNEL-ASM: .version 3.2
|
||||
; KERNEL-ASM-NEXT: .target sm_30
|
||||
; KERNEL-ASM-NEXT: .address_size 64
|
||||
|
||||
; KERNEL-ASM: // .globl kernel_0
|
||||
|
||||
; KERNEL-ASM: .visible .entry kernel_0(
|
||||
; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0
|
||||
; KERNEL-ASM-NEXT: )
|
||||
|
||||
; void double_parallel_loop(float A[][1024]) {
|
||||
; for (long i = 0; i < 1024; i++)
|
||||
; for (long j = 0; j < 1024; j++)
|
||||
; A[i][j] += i * j;
|
||||
; }
|
||||
;
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
; Test input. Two nested loops over i and j, each running 0 .. 1023, that add
; the product i * j (converted to float) to the element A[i][j] of an array
; with rows of 1024 floats.
define void @double_parallel_loop(ptr %A) {
|
||||
bb:
|
||||
br label %bb2
|
||||
|
||||
bb2: ; preds = %bb13, %bb
|
||||
; Outer loop header over %i.0.
%i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
|
||||
%exitcond1 = icmp ne i64 %i.0, 1024
|
||||
br i1 %exitcond1, label %bb3, label %bb15
|
||||
|
||||
bb3: ; preds = %bb2
|
||||
br label %bb4
|
||||
|
||||
bb4: ; preds = %bb10, %bb3
|
||||
; Inner loop header over %j.0.
%j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
|
||||
%exitcond = icmp ne i64 %j.0, 1024
|
||||
br i1 %exitcond, label %bb5, label %bb12
|
||||
|
||||
bb5: ; preds = %bb4
|
||||
; Loop body, the only block that accesses memory.
%tmp = mul nuw nsw i64 %i.0, %j.0
|
||||
%tmp6 = sitofp i64 %tmp to float
|
||||
%tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0
|
||||
%tmp8 = load float, ptr %tmp7, align 4
|
||||
%tmp9 = fadd float %tmp8, %tmp6
|
||||
store float %tmp9, ptr %tmp7, align 4
|
||||
br label %bb10
|
||||
|
||||
bb10: ; preds = %bb5
|
||||
%tmp11 = add nuw nsw i64 %j.0, 1
|
||||
br label %bb4
|
||||
|
||||
bb12: ; preds = %bb4
|
||||
br label %bb13
|
||||
|
||||
bb13: ; preds = %bb12
|
||||
%tmp14 = add nuw nsw i64 %i.0, 1
|
||||
br label %bb2
|
||||
|
||||
bb15: ; preds = %bb2
|
||||
ret void
|
||||
}
|
||||
@ -1,57 +0,0 @@
|
||||
; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS
|
||||
; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
|
||||
|
||||
%S = type { i32, i32, [12 x %L] }
|
||||
%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
|
||||
|
||||
; Test input. When %b is set and field 0 of %cpi is positive, a loop stores 0
; into field 0 of the %L entries of field 2, at entry index %phi * %l1, where
; %l1 is loaded from field 1 of the same structure. The loop bound %l2 is
; reloaded from field 0 on every iteration.
define void @test(ptr %cpi, i1 %b) {
|
||||
; SCOPS-LABEL: Region: %if.then14---%exit
|
||||
; SCOPS: Invariant Accesses: {
|
||||
; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
|
||||
; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] };
|
||||
; SCOPS-NEXT: Execution Context: [l2, l1] -> { : }
|
||||
; SCOPS-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
|
||||
; SCOPS-NEXT: [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] };
|
||||
; SCOPS-NEXT: Execution Context: [l2, l1] -> { : l2 > 0 }
|
||||
; SCOPS-NEXT: }
|
||||
; SCOPS: Arrays {
|
||||
; SCOPS-NEXT: i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4
|
||||
; SCOPS-NEXT: }
|
||||
|
||||
; Check that we gracefully handle failing invariant loads.
|
||||
; This test case is taken from:
|
||||
; test/Isl/CodeGen/invariant-load-dimension.ll
|
||||
|
||||
; FIXME: Figure out how to actually generate code for this loop.
|
||||
; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function
|
||||
|
||||
entry:
|
||||
; Address of field 1 of %S, loaded later in %for.body.lr.ph.i.
%nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
|
||||
br i1 %b, label %if.then14, label %exit
|
||||
|
||||
if.then14:
|
||||
%l0 = load i32, ptr %cpi, align 8
|
||||
%cmp12.i = icmp sgt i32 %l0, 0
|
||||
br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
|
||||
|
||||
for.body.lr.ph.i:
|
||||
%l1 = load i32, ptr %nt, align 4
|
||||
br label %for.body.i
|
||||
|
||||
for.body.i:
|
||||
%phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
|
||||
; The store index is scaled by the loaded value %l1.
%mul.i163 = mul nsw i32 %phi, %l1
|
||||
%cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
|
||||
store i32 0, ptr %cv, align 8
|
||||
%inc = add nuw nsw i32 %phi, 1
|
||||
; Loop bound reloaded from field 0 of %cpi inside the loop.
%l2 = load i32, ptr %cpi, align 8
|
||||
%cmp.i164 = icmp slt i32 %inc, %l2
|
||||
br i1 %cmp.i164, label %for.body.i, label %exit
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
@ -1,41 +0,0 @@
|
||||
; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \
|
||||
; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
|
||||
|
||||
%S = type { i32, i32, [12 x %L] }
|
||||
%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
|
||||
|
||||
define void @test(ptr %cpi, i1 %b) {
|
||||
; CODEGEN-LABEL: @test(
|
||||
; CODEGEN: polly.preload.begin:
|
||||
; CODEGEN-NEXT: br i1 false
|
||||
|
||||
entry:
|
||||
%nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
|
||||
br i1 %b, label %if.then14, label %exit
|
||||
|
||||
if.then14:
|
||||
%l0 = load i32, ptr %cpi, align 8
|
||||
%cmp12.i = icmp sgt i32 %l0, 0
|
||||
br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
|
||||
|
||||
for.body.lr.ph.i:
|
||||
%l1 = load i32, ptr %nt, align 4
|
||||
br label %for.body.i
|
||||
|
||||
for.body.i:
|
||||
%phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
|
||||
%mul.i163 = mul nsw i32 %phi, %l1
|
||||
%cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
|
||||
store i32 0, ptr %cv, align 8
|
||||
%inc = add nuw nsw i32 %phi, 1
|
||||
%l2 = load i32, ptr %cpi, align 8
|
||||
%cmp.i164 = icmp slt i32 %inc, %l2
|
||||
br i1 %cmp.i164, label %for.body.i, label %exit
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
@ -1,176 +0,0 @@
|
||||
; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
|
||||
; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
|
||||
|
||||
; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
|
||||
; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR
|
||||
|
||||
; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
|
||||
; RUN: -S < %s | FileCheck %s -check-prefix=IR
|
||||
; void foo(float A[2][100]) {
|
||||
; for (long t = 0; t < 100; t++)
|
||||
; for (long i = 1; i < 99; i++)
|
||||
; A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1];
|
||||
; }
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: dim3 k0_dimBlock(32);
|
||||
; CODE-NEXT: dim3 k0_dimGrid(4);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, c0);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
|
||||
; CODE-NEXT: }
|
||||
|
||||
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
|
||||
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
|
||||
; ...
|
||||
; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1
|
||||
; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
|
||||
; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
|
||||
; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]
|
||||
; IR: call i8* @polly_getKernel
|
||||
; ...
|
||||
; IR: call void @polly_freeKernel
|
||||
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
|
||||
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99
|
||||
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
|
||||
|
||||
; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0)
|
||||
; KERNEL-IR-LABEL: entry:
|
||||
; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64
|
||||
; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
; KERNEL-IR-NEXT: %t0 = zext i32 %1 to i64
|
||||
; KERNEL-IR-NEXT: br label %polly.cond
|
||||
|
||||
; KERNEL-IR-LABEL: polly.cond: ; preds = %entry
|
||||
; KERNEL-IR-NEXT: %2 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %3 = add nsw i64 %2, %t0
|
||||
; KERNEL-IR-NEXT: %4 = icmp sle i64 %3, 97
|
||||
; KERNEL-IR-NEXT: br i1 %4, label %polly.then, label %polly.else
|
||||
|
||||
; KERNEL-IR-LABEL: polly.merge: ; preds = %polly.else, %polly.stmt.for.body3
|
||||
; KERNEL-IR-NEXT: ret void
|
||||
|
||||
; KERNEL-IR-LABEL: polly.then: ; preds = %polly.cond
|
||||
; KERNEL-IR-NEXT: %5 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %6 = add nsw i64 %5, %t0
|
||||
; KERNEL-IR-NEXT: br label %polly.stmt.for.body3
|
||||
|
||||
; KERNEL-IR-LABEL: polly.stmt.for.body3: ; preds = %polly.then
|
||||
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
|
||||
; KERNEL-IR-NEXT: %pexp.pdiv_r = urem i64 %c0, 2
|
||||
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100
|
||||
; KERNEL-IR-NEXT: %7 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %8 = add nsw i64 %7, %t0
|
||||
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8
|
||||
; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
|
||||
; KERNEL-IR-NEXT: %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4
|
||||
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
|
||||
; KERNEL-IR-NEXT: %pexp.pdiv_r2 = urem i64 %c0, 2
|
||||
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100
|
||||
; KERNEL-IR-NEXT: %9 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %10 = add nsw i64 %9, %t0
|
||||
; KERNEL-IR-NEXT: %11 = add nsw i64 %10, 1
|
||||
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11
|
||||
; KERNEL-IR-NEXT: %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4
|
||||
; KERNEL-IR-NEXT: %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4
|
||||
; KERNEL-IR-NEXT: %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_
|
||||
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
|
||||
; KERNEL-IR-NEXT: %pexp.pdiv_r7 = urem i64 %c0, 2
|
||||
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100
|
||||
; KERNEL-IR-NEXT: %12 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %13 = add nsw i64 %12, %t0
|
||||
; KERNEL-IR-NEXT: %14 = add nsw i64 %13, 2
|
||||
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14
|
||||
; KERNEL-IR-NEXT: %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9
|
||||
; KERNEL-IR-NEXT: %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4
|
||||
; KERNEL-IR-NEXT: %p_add12 = fadd float %p_add, %tmp3_p_scalar_
|
||||
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
|
||||
; KERNEL-IR-NEXT: %15 = add nsw i64 %c0, 1
|
||||
; KERNEL-IR-NEXT: %pexp.pdiv_r12 = urem i64 %15, 2
|
||||
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100
|
||||
; KERNEL-IR-NEXT: %16 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %17 = add nsw i64 %16, %t0
|
||||
; KERNEL-IR-NEXT: %18 = add nsw i64 %17, 1
|
||||
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18
|
||||
; KERNEL-IR-NEXT: %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14
|
||||
; KERNEL-IR-NEXT: %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4
|
||||
; KERNEL-IR-NEXT: %p_add17 = fadd float %tmp4_p_scalar_, %p_add12
|
||||
; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
|
||||
; KERNEL-IR-NEXT: %19 = add nsw i64 %c0, 1
|
||||
; KERNEL-IR-NEXT: %pexp.pdiv_r17 = urem i64 %19, 2
|
||||
; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100
|
||||
; KERNEL-IR-NEXT: %20 = mul nsw i64 32, %b0
|
||||
; KERNEL-IR-NEXT: %21 = add nsw i64 %20, %t0
|
||||
; KERNEL-IR-NEXT: %22 = add nsw i64 %21, 1
|
||||
; KERNEL-IR-NEXT: %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22
|
||||
; KERNEL-IR-NEXT: %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19
|
||||
; KERNEL-IR-NEXT: store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4
|
||||
; KERNEL-IR-NEXT: br label %polly.merge
|
||||
|
||||
; KERNEL-IR-LABEL: polly.else: ; preds = %polly.cond
|
||||
; KERNEL-IR-NEXT: br label %polly.merge
|
||||
; KERNEL-IR-NEXT: }
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
define void @foo([100 x float]* %A) {
|
||||
entry:
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc18, %entry
|
||||
%t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]
|
||||
%exitcond1 = icmp ne i64 %t.0, 100
|
||||
br i1 %exitcond1, label %for.body, label %for.end20
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
br label %for.cond1
|
||||
|
||||
for.cond1: ; preds = %for.inc, %for.body
|
||||
%i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ]
|
||||
%exitcond = icmp ne i64 %i.0, 99
|
||||
br i1 %exitcond, label %for.body3, label %for.end
|
||||
|
||||
for.body3: ; preds = %for.cond1
|
||||
%sub = add nsw i64 %i.0, -1
|
||||
%rem = srem i64 %t.0, 2
|
||||
%arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub
|
||||
%tmp = load float, float* %arrayidx4, align 4
|
||||
%rem5 = srem i64 %t.0, 2
|
||||
%arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0
|
||||
%tmp2 = load float, float* %arrayidx7, align 4
|
||||
%add = fadd float %tmp, %tmp2
|
||||
%add8 = add nuw nsw i64 %i.0, 1
|
||||
%rem9 = srem i64 %t.0, 2
|
||||
%arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8
|
||||
%tmp3 = load float, float* %arrayidx11, align 4
|
||||
%add12 = fadd float %add, %tmp3
|
||||
%add13 = add nuw nsw i64 %t.0, 1
|
||||
%rem14 = srem i64 %add13, 2
|
||||
%arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0
|
||||
%tmp4 = load float, float* %arrayidx16, align 4
|
||||
%add17 = fadd float %tmp4, %add12
|
||||
store float %add17, float* %arrayidx16, align 4
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %for.body3
|
||||
%inc = add nuw nsw i64 %i.0, 1
|
||||
br label %for.cond1
|
||||
|
||||
for.end: ; preds = %for.cond1
|
||||
br label %for.inc18
|
||||
|
||||
for.inc18: ; preds = %for.end
|
||||
%inc19 = add nuw nsw i64 %t.0, 1
|
||||
br label %for.cond
|
||||
|
||||
for.end20: ; preds = %for.cond
|
||||
ret void
|
||||
}
|
||||
@ -1,204 +0,0 @@
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
|
||||
; RUN: -polly-invariant-load-hoisting=false \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck -check-prefix=CODE %s
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
|
||||
; RUN: -polly-invariant-load-hoisting=false \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck -check-prefix=KERNEL-IR %s
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
declare void @llvm.lifetime.start(i64, ptr nocapture) #0
|
||||
|
||||
; This test case tests that we can correctly handle a ScopStmt that is
|
||||
; scheduled on the host, instead of within a kernel.
|
||||
|
||||
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: dim3 k0_dimBlock(32);
|
||||
; CODE-NEXT: dim3 k0_dimGrid(16);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: if (p_0 <= 510 && p_1 <= 510) {
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: dim3 k1_dimBlock(32);
|
||||
; CODE-NEXT: dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
|
||||
; CODE-NEXT: kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: {
|
||||
; CODE-NEXT: dim3 k2_dimBlock(16, 32);
|
||||
; CODE-NEXT: dim3 k2_dimGrid(16, p_1 <= -7650 ? 256 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
|
||||
; CODE-NEXT: kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: }
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
|
||||
; CODE-NEXT: Stmt_for_cond33_preheader_last();
|
||||
|
||||
; CODE: }
|
||||
|
||||
; CODE: # kernel0
|
||||
; CODE-NEXT: Stmt_for_body16(32 * b0 + t0);
|
||||
|
||||
; CODE: # kernel1
|
||||
; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1)
|
||||
; CODE-NEXT: for (int c1 = 0; c1 <= 15; c1 += 1) {
|
||||
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0)
|
||||
; CODE-NEXT: Stmt_for_body35(32 * b0 + t0 + 1048576 * c0);
|
||||
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510)
|
||||
; CODE-NEXT: for (int c3 = 0; c3 <= 31; c3 += 1)
|
||||
; CODE-NEXT: Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3);
|
||||
; CODE-NEXT: sync0();
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: # kernel2
|
||||
; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1)
|
||||
; CODE-NEXT: if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510)
|
||||
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
|
||||
; CODE-NEXT: Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3);
|
||||
|
||||
; KERNEL-IR: call void @llvm.nvvm.barrier0()
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, ptr %A, ptr %R, ptr %Q) #1 {
|
||||
entry:
|
||||
br label %entry.split
|
||||
|
||||
entry.split: ; preds = %entry
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %entry.split, %for.inc86
|
||||
%indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ]
|
||||
%indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ]
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %for.cond1.preheader, %for.inc
|
||||
%indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
|
||||
%nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ]
|
||||
%arrayidx5 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
|
||||
%tmp = load double, ptr %arrayidx5, align 8, !tbaa !1
|
||||
%arrayidx9 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
|
||||
%tmp27 = load double, ptr %arrayidx9, align 8, !tbaa !1
|
||||
%mul = fmul double %tmp, %tmp27
|
||||
%add = fadd double %nrm.02, %mul
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 512
|
||||
br i1 %exitcond, label %for.inc, label %for.end
|
||||
|
||||
for.end: ; preds = %for.inc
|
||||
%add.lcssa = phi double [ %add, %for.inc ]
|
||||
%call = tail call double @sqrt(double %add.lcssa) #2
|
||||
%arrayidx13 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24
|
||||
store double %call, ptr %arrayidx13, align 8, !tbaa !1
|
||||
br label %for.body16
|
||||
|
||||
for.cond33.preheader: ; preds = %for.body16
|
||||
%indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
|
||||
%cmp347 = icmp slt i64 %indvars.iv.next25, 512
|
||||
br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86
|
||||
|
||||
for.body35.lr.ph: ; preds = %for.cond33.preheader
|
||||
br label %for.body35
|
||||
|
||||
for.body16: ; preds = %for.end, %for.body16
|
||||
%indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ]
|
||||
%arrayidx20 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv10, i64 %indvars.iv24
|
||||
%tmp28 = load double, ptr %arrayidx20, align 8, !tbaa !1
|
||||
%arrayidx24 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24
|
||||
%tmp29 = load double, ptr %arrayidx24, align 8, !tbaa !1
|
||||
%div = fdiv double %tmp28, %tmp29
|
||||
%arrayidx28 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv10, i64 %indvars.iv24
|
||||
store double %div, ptr %arrayidx28, align 8, !tbaa !1
|
||||
%indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
|
||||
%exitcond12 = icmp ne i64 %indvars.iv.next11, 512
|
||||
br i1 %exitcond12, label %for.body16, label %for.cond33.preheader
|
||||
|
||||
for.cond33.loopexit: ; preds = %for.body62
|
||||
%indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
|
||||
%lftr.wideiv = trunc i64 %indvars.iv.next22 to i32
|
||||
%exitcond23 = icmp ne i32 %lftr.wideiv, 512
|
||||
br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge
|
||||
|
||||
for.body35: ; preds = %for.body35.lr.ph, %for.cond33.loopexit
|
||||
%indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ]
|
||||
%arrayidx39 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
|
||||
store double 0.000000e+00, ptr %arrayidx39, align 8, !tbaa !1
|
||||
br label %for.body42
|
||||
|
||||
for.cond60.preheader: ; preds = %for.body42
|
||||
br label %for.body62
|
||||
|
||||
for.body42: ; preds = %for.body35, %for.body42
|
||||
%indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ]
|
||||
%arrayidx46 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv13, i64 %indvars.iv24
|
||||
%tmp30 = load double, ptr %arrayidx46, align 8, !tbaa !1
|
||||
%arrayidx50 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv13, i64 %indvars.iv21
|
||||
%tmp31 = load double, ptr %arrayidx50, align 8, !tbaa !1
|
||||
%mul51 = fmul double %tmp30, %tmp31
|
||||
%arrayidx55 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
|
||||
%tmp32 = load double, ptr %arrayidx55, align 8, !tbaa !1
|
||||
%add56 = fadd double %tmp32, %mul51
|
||||
store double %add56, ptr %arrayidx55, align 8, !tbaa !1
|
||||
%indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
|
||||
%exitcond15 = icmp ne i64 %indvars.iv.next14, 512
|
||||
br i1 %exitcond15, label %for.body42, label %for.cond60.preheader
|
||||
|
||||
for.body62: ; preds = %for.cond60.preheader, %for.body62
|
||||
%indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ]
|
||||
%arrayidx66 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21
|
||||
%tmp33 = load double, ptr %arrayidx66, align 8, !tbaa !1
|
||||
%arrayidx70 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv16, i64 %indvars.iv24
|
||||
%tmp34 = load double, ptr %arrayidx70, align 8, !tbaa !1
|
||||
%arrayidx74 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
|
||||
%tmp35 = load double, ptr %arrayidx74, align 8, !tbaa !1
|
||||
%mul75 = fmul double %tmp34, %tmp35
|
||||
%sub = fsub double %tmp33, %mul75
|
||||
%arrayidx79 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21
|
||||
store double %sub, ptr %arrayidx79, align 8, !tbaa !1
|
||||
%indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
|
||||
%exitcond18 = icmp ne i64 %indvars.iv.next17, 512
|
||||
br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit
|
||||
|
||||
for.cond33.for.inc86_crit_edge: ; preds = %for.cond33.loopexit
|
||||
br label %for.inc86
|
||||
|
||||
for.inc86: ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader
|
||||
%indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
|
||||
%exitcond26 = icmp ne i64 %indvars.iv.next25, 512
|
||||
br i1 %exitcond26, label %for.cond1.preheader, label %for.end88
|
||||
|
||||
for.end88: ; preds = %for.inc86
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: argmemonly nounwind
|
||||
declare void @llvm.lifetime.end(i64, ptr nocapture) #0
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare double @sqrt(double) #2
|
||||
|
||||
attributes #0 = { argmemonly nounwind }
|
||||
attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind }
|
||||
|
||||
!llvm.ident = !{!0}
|
||||
|
||||
!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
|
||||
!1 = !{!2, !2, i64 0}
|
||||
!2 = !{!"double", !3, i64 0}
|
||||
!3 = !{!"omnipotent char", !4, i64 0}
|
||||
!4 = !{!"Simple C/C++ TBAA"}
|
||||
@ -1,41 +0,0 @@
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck -check-prefix=CODE %s
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; CODE: Code
|
||||
; CODE: ====
|
||||
; CODE: No code generated
|
||||
|
||||
source_filename = "bugpoint-output-83bcdeb.bc"
|
||||
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
@__data_radiation_MOD_cobi = external global [168 x double], align 32
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define void @__radiation_rg_MOD_coe_so() #0 {
|
||||
entry:
|
||||
%polly.access.kspec.load = load i32, ptr undef, align 4
|
||||
%0 = or i1 undef, undef
|
||||
br label %polly.preload.cond29
|
||||
|
||||
polly.preload.cond29: ; preds = %entry
|
||||
br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30
|
||||
|
||||
polly.preload.merge30: ; preds = %polly.preload.exec31, %polly.preload.cond29
|
||||
%polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ]
|
||||
ret void
|
||||
|
||||
polly.preload.exec31: ; preds = %polly.preload.cond29
|
||||
%1 = sext i32 %polly.access.kspec.load to i64
|
||||
%2 = mul nsw i64 7, %1
|
||||
%3 = add nsw i64 0, %2
|
||||
%4 = add nsw i64 %3, 48
|
||||
%polly.access.__data_radiation_MOD_cobi = getelementptr double, ptr @__data_radiation_MOD_cobi, i64 %4
|
||||
%polly.access.__data_radiation_MOD_cobi.load = load double, ptr %polly.access.__data_radiation_MOD_cobi, align 8
|
||||
br label %polly.preload.merge30
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind uwtable }
|
||||
@ -1,76 +0,0 @@
|
||||
; RUN: opt -opaque-pointers=0 %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
|
||||
; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR
|
||||
; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR
|
||||
|
||||
; Test that we do recognise and codegen a kernel that has intrinsics.
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; Check that we model the kernel as a scop.
|
||||
; SCOP: Function: f
|
||||
; SCOP-NEXT: Region: %entry.split---%for.end
|
||||
|
||||
; Check that the intrinsic call is present in the kernel IR.
|
||||
; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
|
||||
; KERNEL-IR: declare float @llvm.sqrt.f32(float)
|
||||
; KERNEL-IR: declare float @llvm.fabs.f32(float)
|
||||
|
||||
|
||||
; Check that kernel launch is generated in host IR.
|
||||
; The declaration would not be generated unless a call to a kernel exists.
|
||||
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
|
||||
|
||||
|
||||
; void f(float *A, float *B, int N) {
|
||||
; for(int i = 0; i < N; i++) {
|
||||
; float tmp0 = A[i];
|
||||
; float tmp1 = sqrt(tmp0);
|
||||
; float tmp2 = fabs(tmp1);
|
||||
; float tmp3 = copysignf(tmp1, tmp2);
|
||||
; B[i] = tmp3;
|
||||
; }
|
||||
; }
|
||||
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
define void @f(float* %A, float* %B, i32 %N) {
|
||||
entry:
|
||||
br label %entry.split
|
||||
|
||||
entry.split: ; preds = %entry
|
||||
%cmp1 = icmp sgt i32 %N, 0
|
||||
br i1 %cmp1, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph: ; preds = %entry.split
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.lr.ph, %for.body
|
||||
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
|
||||
%A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
|
||||
%A.arr.i.val = load float, float* %A.arr.i, align 4
|
||||
; Call to intrinsics that should be part of the kernel.
|
||||
%sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
|
||||
%fabs = tail call float @llvm.fabs.f32(float %sqrt);
|
||||
%copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
|
||||
%B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
|
||||
store float %copysign, float* %B.arr.i, align 4
|
||||
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%wide.trip.count = zext i32 %N to i64
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
|
||||
br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
|
||||
|
||||
for.cond.for.end_crit_edge: ; preds = %for.body
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.sqrt.f32(float) #0
|
||||
declare float @llvm.fabs.f32(float) #0
|
||||
declare float @llvm.copysign.f32(float, float) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
||||
@ -1,47 +0,0 @@
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \
|
||||
; RUN: -disable-output < %s
|
||||
|
||||
; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually
|
||||
; fail on an illegal module.
|
||||
|
||||
; REQUIRES: pollyacc, asserts
|
||||
; XFAIL: *
|
||||
;
|
||||
; void foo(long A[1024], long B[1024]) {
|
||||
; for (long i = 0; i < 1024; i++)
|
||||
; A[i] += (B[i] + (long)&B[i]);
|
||||
; }
|
||||
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
define void @foo(ptr %A, ptr %B) {
|
||||
bb:
|
||||
br label %bb1
|
||||
|
||||
bb1: ; preds = %bb10, %bb
|
||||
%i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
|
||||
%exitcond = icmp ne i64 %i.0, 1024
|
||||
br i1 %exitcond, label %bb2, label %bb12
|
||||
|
||||
bb2: ; preds = %bb1
|
||||
%tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
|
||||
%tmp3 = load i64, ptr %tmp, align 8
|
||||
%tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
|
||||
%tmp5 = ptrtoint ptr %tmp4 to i64
|
||||
%tmp6 = add nsw i64 %tmp3, %tmp5
|
||||
%tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
|
||||
%tmp8 = load i64, ptr %tmp7, align 8
|
||||
%tmp9 = add nsw i64 %tmp8, %tmp6
|
||||
store i64 %tmp9, ptr %tmp7, align 8
|
||||
br label %bb10
|
||||
|
||||
bb10: ; preds = %bb2
|
||||
%tmp11 = add nuw nsw i64 %i.0, 1
|
||||
br label %bb1
|
||||
|
||||
bb12: ; preds = %bb1
|
||||
ret void
|
||||
}
|
||||
@ -1,73 +0,0 @@
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck -check-prefix=CODE %s
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: not FileCheck %s -check-prefix=KERNEL-IR
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
|
||||
; RUN: FileCheck %s -check-prefix=IR
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
;
|
||||
; void foo(long A[1024], long B[1024]) {
|
||||
; for (long i = 0; i < 1024; i++)
|
||||
; A[i] += (B[i] + (long)&B[i]);
|
||||
; }
|
||||
|
||||
; This kernel loads/stores a pointer address we model. This is a rare case,
|
||||
; where we still lack proper code-generation support. We check here that we
|
||||
; detect the invalid IR and bail out gracefully.
|
||||
|
||||
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
|
||||
; CODE-NEXT: {
|
||||
; CODE-NEXT: dim3 k0_dimBlock(32);
|
||||
; CODE-NEXT: dim3 k0_dimGrid(32);
|
||||
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B, dev_MemRef_A);
|
||||
; CODE-NEXT: cudaCheckKernel();
|
||||
; CODE-NEXT: }
|
||||
|
||||
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
|
||||
|
||||
; CODE: # kernel0
|
||||
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
|
||||
|
||||
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
|
||||
; RUN: FileCheck %s -check-prefix=IR
|
||||
|
||||
; KERNEL-IR: kernel
|
||||
|
||||
; IR: br i1 false, label %polly.start, label %bb1
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
define void @foo(ptr %A, ptr %B) {
|
||||
bb:
|
||||
br label %bb1
|
||||
|
||||
bb1: ; preds = %bb10, %bb
|
||||
%i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
|
||||
%exitcond = icmp ne i64 %i.0, 1024
|
||||
br i1 %exitcond, label %bb2, label %bb12
|
||||
|
||||
bb2: ; preds = %bb1
|
||||
%tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
|
||||
%tmp3 = load i64, ptr %tmp, align 8
|
||||
%tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
|
||||
%tmp5 = ptrtoint ptr %tmp4 to i64
|
||||
%tmp6 = add nsw i64 %tmp3, %tmp5
|
||||
%tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
|
||||
%tmp8 = load i64, ptr %tmp7, align 8
|
||||
%tmp9 = add nsw i64 %tmp8, %tmp6
|
||||
store i64 %tmp9, ptr %tmp7, align 8
|
||||
br label %bb10
|
||||
|
||||
bb10: ; preds = %bb2
|
||||
%tmp11 = add nuw nsw i64 %i.0, 1
|
||||
br label %bb1
|
||||
|
||||
bb12: ; preds = %bb1
|
||||
ret void
|
||||
}
|
||||
@ -1,70 +0,0 @@
|
||||
; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
|
||||
|
||||
; RUN: opt %loadPolly -S -polly-codegen-ppcg \
|
||||
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
|
||||
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; Check that we detect a scop.
|
||||
; SCOP: Function: f
|
||||
; SCOP-NEXT: Region: %for.body---%for.end
|
||||
; SCOP-NEXT: Max Loop Depth: 1
|
||||
; SCOP-NEXT: Invariant Accesses: {
|
||||
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
|
||||
; SCOP-NEXT: [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] };
|
||||
; SCOP-NEXT: Execution Context: [tmp] -> { : }
|
||||
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
|
||||
; SCOP-NEXT: [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] };
|
||||
; SCOP-NEXT: Execution Context: [tmp] -> { : tmp >= 4 }
|
||||
; SCOP-NEXT: }
|
||||
|
||||
; Check that kernel launch is generated in host IR.
|
||||
; the declare would not be generated unless a call to a kernel exists.
|
||||
; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
|
||||
|
||||
; This test makes sure that such an access pattern is handled correctly
|
||||
; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads`
|
||||
; was the main reason that caused this test case to crash.
|
||||
;
|
||||
; void f(int *arr, const int *control, const int *readarr) {
|
||||
; for(int i = 0; i < 1000; i++) {
|
||||
; int t = 0;
|
||||
; if (*control > 3) {
|
||||
; t += *readarr;
|
||||
; }
|
||||
; arr[i] = t;
|
||||
; }
|
||||
; }
|
||||
|
||||
|
||||
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
|
||||
target triple = "i386-apple-macosx10.12.0"
|
||||
define void @f(ptr %arr, ptr %control, ptr %readarr) {
|
||||
entry:
|
||||
br label %entry.split
|
||||
|
||||
entry.split: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %entry.split, %if.end
|
||||
%i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ]
|
||||
%tmp = load i32, ptr %control, align 4
|
||||
%cmp1 = icmp sgt i32 %tmp, 3
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%tmp1 = load i32, ptr %readarr, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
%t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ]
|
||||
%arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.01
|
||||
store i32 %t.0, ptr %arrayidx, align 4
|
||||
%inc = add nuw nsw i32 %i.01, 1
|
||||
%exitcond = icmp eq i32 %inc, 1000
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end: ; preds = %if.end
|
||||
ret void
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user