[Polly] Remove Polly-ACC.

Polly-ACC is unmaintained. Because it was never ported to the new pass manager (NPM) pipeline, since D136621 it has not even been accessible without manually specifying its passes on the `opt` command line. Since there is no plan to bring it into a maintainable state, remove it from Polly. Reviewed By: grosser Differential Revision: https://reviews.llvm.org/D142580
2023-01-25 14:03:57 -06:00 · 2023-01-25 14:03:57 -06:00 · 19afbfe331
commit 19afbfe331
parent 115c7beda7
166 changed files with 29 additions and 31394 deletions
--- a/polly/CMakeLists.txt
+++ b/polly/CMakeLists.txt
@ -85,31 +85,6 @@ endif ()

 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

-option(POLLY_ENABLE_GPGPU_CODEGEN "Enable GPGPU code generation feature" OFF)
-set(GPU_CODEGEN FALSE)
-if (POLLY_ENABLE_GPGPU_CODEGEN)
-  # Do not require CUDA/OpenCL, as GPU code generation test cases can be run
-  # without a CUDA/OpenCL library.
-  if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
-    FIND_PACKAGE(CUDA)
-    FIND_PACKAGE(OpenCL)
-    set(GPU_CODEGEN TRUE)
-  else()
-    message(WARNING "The LLVM NVPTX target is required for GPU code generation")
-  endif()
-endif(POLLY_ENABLE_GPGPU_CODEGEN)
-
-
-# Support GPGPU code generation if the library is available.
-if (CUDA_FOUND)
-  add_definitions(-DHAS_LIBCUDART)
-  INCLUDE_DIRECTORIES( ${CUDA_INCLUDE_DIRS} )
-endif(CUDA_FOUND)
-if (OpenCL_FOUND)
-  add_definitions(-DHAS_LIBOPENCL)
-  INCLUDE_DIRECTORIES( ${OpenCL_INCLUDE_DIR} )
-endif(OpenCL_FOUND)
-
 option(POLLY_BUNDLED_ISL "Use the bundled version of libisl included in Polly" ON)
 if (NOT POLLY_BUNDLED_ISL)
  find_package(ISL MODULE REQUIRED)
@ -155,7 +130,6 @@ add_subdirectory(test)
 if (POLLY_GTEST_AVAIL)
  add_subdirectory(unittests)
 endif ()
-add_subdirectory(tools)
 add_subdirectory(cmake)
 # TODO: docs.

--- a/polly/cmake/CMakeLists.txt
+++ b/polly/cmake/CMakeLists.txt
@ -27,9 +27,6 @@ if (NOT WIN32 AND LLVM_ENABLE_PIC)
  # LLVMPolly is a dummy target on Win or if PIC code is disabled.
  list(APPEND POLLY_CONFIG_EXPORTED_TARGETS LLVMPolly)
 endif()
-if (POLLY_ENABLE_GPGPU_CODEGEN)
-  list(APPEND POLLY_CONFIG_EXPORTED_TARGETS PollyPPCG)
-endif()

 # Get the target type for every exported target
 foreach(tgt IN LISTS POLLY_CONFIG_EXPORTED_TARGETS)
--- a/polly/cmake/PollyConfig.cmake.in
+++ b/polly/cmake/PollyConfig.cmake.in
@ -8,7 +8,6 @@ find_package(LLVM ${LLVM_VERSION} EXACT REQUIRED CONFIG

 set(Polly_CMAKE_DIR ${CMAKE_CURRENT_LIST_DIR})
 set(Polly_BUNDLED_ISL @POLLY_BUNDLED_ISL@)
-set(Polly_ENABLE_GPGPU_CODEGEN @POLLY_ENABLE_GPGPU_CODEGEN@)

 set(Polly_DEFINITIONS ${LLVM_DEFINITIONS})
 set(Polly_INCLUDE_DIRS @POLLY_CONFIG_INCLUDE_DIRS@ ${LLVM_INCLUDE_DIRS})
@ -19,17 +18,9 @@ set(Polly_LIBRARIES ${LLVM_LIBRARIES} ${Polly_EXPORTED_TARGETS})
 # Imported Targets:
@ISL_CONFIG_CODE@

-if (Polly_ENABLE_GPGPU_CODEGEN AND NOT TARGET PollyPPCG)
-  add_library(PollyPPCG @POLLY_CONFIG_TARGET_PollyPPCG_TYPE@ IMPORTED)
-  set_property(TARGET PollyPPCG PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@)
-endif()
-
 if (NOT TARGET Polly)
  add_library(Polly @POLLY_CONFIG_TARGET_Polly_TYPE@ IMPORTED)
  set_property(TARGET Polly PROPERTY INTERFACE_LINK_LIBRARIES @ISL_TARGET@)
-  if (Polly_ENABLE_GPGPU_CODEGEN)
-    set_property(TARGET Polly APPEND PROPERTY INTERFACE_LINK_LIBRARIES PollyPPCG)
-  endif()
 endif()

 if (NOT TARGET LLVMPolly)
--- a/polly/docs/ReleaseNotes.rst
+++ b/polly/docs/ReleaseNotes.rst
@ -21,3 +21,5 @@ In Polly |version| the following important changes have been incorporated.
  In the future we hope that Polly can collaborate better with LoopVectorize,
  like Polly marking a loop is safe to vectorize with a specific simd width,
  instead of replicating its functionality.
+
+- Polly-ACC has been removed.
--- a/polly/include/polly/CodeGen/PPCGCodeGeneration.h
+++ b/polly/include/polly/CodeGen/PPCGCodeGeneration.h
@ -1,33 +0,0 @@
-//===--- polly/PPCGCodeGeneration.h - Polly Accelerator Code Generation. --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Take a scop created by ScopInfo and map it to GPU code using the ppcg
-// GPU mapping strategy.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef POLLY_PPCGCODEGENERATION_H
-#define POLLY_PPCGCODEGENERATION_H
-
-/// The GPU Architecture to target.
-enum GPUArch { NVPTX64, SPIR32, SPIR64 };
-
-/// The GPU Runtime implementation to use.
-enum GPURuntime { CUDA, OpenCL };
-
-namespace polly {
-extern bool PollyManagedMemory;
-
-/// Use for pass instantiation defaults.
-/// @{
-extern GPURuntime GPURuntimeChoice;
-extern GPUArch GPUArchChoice;
-/// @}
-} // namespace polly
-
-#endif // POLLY_PPCGCODEGENERATION_H
--- a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h
+++ b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h
@ -30,24 +30,20 @@ namespace polly {
 struct RuntimeDebugBuilder {

  /// Generate a constant string into the builder's llvm::Module which can be
-  /// passed to createGPUPrinter() or createGPUPrinter().
+  /// passed to createCPUPrinter().
  ///
  /// @param Builder The builder used to emit the printer calls.
  /// @param Str     The string to be printed.

  /// @return        A global containing @p Str.
  static llvm::Value *getPrintableString(PollyIRBuilder &Builder,
-                                         llvm::StringRef Str) {
-    // TODO: Get rid of magic number 4. It it NVPTX's constant address space and
-    // works on X86 (CPU) only because its backend ignores the address space.
-    return Builder.CreateGlobalStringPtr(Str, "", 4);
-  }
+                                         llvm::StringRef Str);

  /// Return whether an llvm::Value of the type @p Ty is printable for
  /// debugging.
  ///
-  /// That is, whether such a value can be passed to createGPUPrinter() or
-  /// createGPUPrinter() to be dumped as runtime.  If false is returned, those
+  /// That is, whether such a value can be passed to createCPUPrinter()
+  /// to be dumped as runtime.  If false is returned, those
  /// functions will fail.
  static bool isPrintable(llvm::Type *Ty);

@ -64,62 +60,41 @@ struct RuntimeDebugBuilder {
  template <typename... Args>
  static void createCPUPrinter(PollyIRBuilder &Builder, Args... args) {
    std::vector<llvm::Value *> Vector;
-    createPrinter(Builder, /* CPU */ false, Vector, args...);
-  }
-
-  /// Print a set of LLVM-IR Values or StringRefs on an NVIDIA GPU.
-  ///
-  ///  This function emits a call to vprintf that will print the given
-  ///  arguments from within a kernel thread. It is useful for debugging
-  ///  CUDA program kernels. All arguments given in this list will be
-  ///  automatically concatenated and the resulting string will be printed
-  ///  atomically. We also support ArrayRef arguments, which can be used to
-  ///  provide for example a list of thread-id values.
-  ///
-  ///  @param Builder The builder used to emit the printer calls.
-  ///  @param Args    The list of values to print.
-  template <typename... Args>
-  static void createGPUPrinter(PollyIRBuilder &Builder, Args... args) {
-    std::vector<llvm::Value *> Vector;
-    createPrinter(Builder, /* GPU */ true, Vector, args...);
+    createPrinter(Builder, Vector, args...);
  }

 private:
  /// Handle Values.
  template <typename... Args>
-  static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
+  static void createPrinter(PollyIRBuilder &Builder,
                            std::vector<llvm::Value *> &Values,
                            llvm::Value *Value, Args... args) {
    Values.push_back(Value);
-    createPrinter(Builder, UseGPU, Values, args...);
+    createPrinter(Builder, Values, args...);
  }

  /// Handle StringRefs.
  template <typename... Args>
-  static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
+  static void createPrinter(PollyIRBuilder &Builder,
                            std::vector<llvm::Value *> &Values,
                            llvm::StringRef String, Args... args) {
    Values.push_back(getPrintableString(Builder, String));
-    createPrinter(Builder, UseGPU, Values, args...);
+    createPrinter(Builder, Values, args...);
  }

  /// Handle ArrayRefs.
  template <typename... Args>
-  static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
+  static void createPrinter(PollyIRBuilder &Builder,
                            std::vector<llvm::Value *> &Values,
                            llvm::ArrayRef<llvm::Value *> Array, Args... args) {
    Values.insert(Values.end(), Array.begin(), Array.end());
-    createPrinter(Builder, UseGPU, Values, args...);
+    createPrinter(Builder, Values, args...);
  }

  /// Print a list of Values.
-  static void createPrinter(PollyIRBuilder &Builder, bool UseGPU,
+  static void createPrinter(PollyIRBuilder &Builder,
                            llvm::ArrayRef<llvm::Value *> Values);

-  /// Print a list of Values on a GPU.
-  static void createGPUPrinterT(PollyIRBuilder &Builder,
-                                llvm::ArrayRef<llvm::Value *> Values);
-
  /// Print a list of Values on a CPU.
  static void createCPUPrinterT(PollyIRBuilder &Builder,
                                llvm::ArrayRef<llvm::Value *> Values);
@ -145,22 +120,6 @@ private:
  ///
  /// @parma Builder The builder used to insert the code.
  static void createFlush(PollyIRBuilder &Builder);
-
-  /// Get (and possibly insert) a NVIDIA address space cast call.
-  static llvm::Function *getAddressSpaceCast(PollyIRBuilder &Builder,
-                                             unsigned Src, unsigned Dst,
-                                             unsigned SrcBits = 8,
-                                             unsigned DstBits = 8);
-
-  /// Get identifiers that describe the currently executed GPU thread.
-  ///
-  /// The result will be a vector that if passed to the GPU printer will result
-  /// into a string (initialized to values corresponding to the printing
-  /// thread):
-  ///
-  ///   "> block-id: bidx bid1y bidz | thread-id: tidx tidy tidz "
-  static std::vector<llvm::Value *>
-  getGPUThreadIdentifiers(PollyIRBuilder &Builder);
 };
 } // namespace polly

--- a/polly/include/polly/Config/config.h.cmake
+++ b/polly/include/polly/Config/config.h.cmake
@ -12,7 +12,4 @@
 #ifndef POLLY_CONFIG_H
 #define POLLY_CONFIG_H

-#cmakedefine CUDA_FOUND
-#cmakedefine GPU_CODEGEN
-
 #endif
--- a/polly/include/polly/LinkAllPasses.h
+++ b/polly/include/polly/LinkAllPasses.h
@ -14,7 +14,6 @@
 #ifndef POLLY_LINKALLPASSES_H
 #define POLLY_LINKALLPASSES_H

-#include "polly/CodeGen/PPCGCodeGeneration.h"
 #include "polly/Config/config.h"
 #include "polly/Support/DumpFunctionPass.h"
 #include "polly/Support/DumpModulePass.h"
@ -54,14 +53,6 @@ llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS);
 llvm::Pass *createIslAstInfoWrapperPassPass();
 llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS);
 llvm::Pass *createCodeGenerationPass();
-#ifdef GPU_CODEGEN
-llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
-                                         GPURuntime Runtime = GPURuntime::CUDA);
-
-llvm::Pass *
-createManagedMemoryRewritePassPass(GPUArch Arch = GPUArch::NVPTX64,
-                                   GPURuntime Runtime = GPURuntime::CUDA);
-#endif
 llvm::Pass *createIslScheduleOptimizerWrapperPass();
 llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS);
 llvm::Pass *createFlattenSchedulePass();
@ -113,10 +104,6 @@ struct PollyForcePassLinking {
    polly::createIslAstInfoWrapperPassPass();
    polly::createIslAstInfoPrinterLegacyPass(llvm::outs());
    polly::createCodeGenerationPass();
-#ifdef GPU_CODEGEN
-    polly::createPPCGCodeGenerationPass();
-    polly::createManagedMemoryRewritePassPass();
-#endif
    polly::createIslScheduleOptimizerWrapperPass();
    polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs());
    polly::createMaximalStaticExpansionPass();
@ -156,10 +143,6 @@ void initializeDependenceInfoPrinterLegacyFunctionPassPass(
 void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &);
 void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &);
 void initializeCodeGenerationPass(llvm::PassRegistry &);
-#ifdef GPU_CODEGEN
-void initializePPCGCodeGenerationPass(llvm::PassRegistry &);
-void initializeManagedMemoryRewritePassPass(llvm::PassRegistry &);
-#endif
 void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &);
 void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &);
 void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &);
--- a/polly/include/polly/ScopInfo.h
+++ b/polly/include/polly/ScopInfo.h
@ -1684,9 +1684,6 @@ private:
  /// Number of copy statements.
  unsigned CopyStmtsNum = 0;

-  /// Flag to indicate if the Scop is to be skipped.
-  bool SkipScop = false;
-
  using StmtSet = std::list<ScopStmt>;

  /// The statements in this Scop.
@ -2144,12 +2141,6 @@ public:
  /// Check if the SCoP has been optimized by the scheduler.
  bool isOptimized() const { return IsOptimized; }

-  /// Mark the SCoP to be skipped by ScopPass passes.
-  void markAsToBeSkipped() { SkipScop = true; }
-
-  /// Check if the SCoP is to be skipped by ScopPass passes.
-  bool isToBeSkipped() const { return SkipScop; }
-
  /// Return the ID of the Scop
  int getID() const { return ID; }

--- a/polly/include/polly/Support/LinkGPURuntime.h
+++ b/polly/include/polly/Support/LinkGPURuntime.h
@ -1,42 +0,0 @@
-//===- Support/LinkGPURuntime.h -- Headerfile to help force-link GPURuntime  =//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This header helps pull in libGPURuntime.so
-//
-//===----------------------------------------------------------------------===//
-#ifndef POLLY_LINK_GPURUNTIME
-#define POLLY_LINK_GPURUNTIME
-
-extern "C" {
-#include "GPURuntime/GPUJIT.h"
-}
-
-namespace polly {
-struct ForceGPURuntimeLinking {
-  ForceGPURuntimeLinking() {
-    if (std::getenv("bar") != (char *)-1)
-      return;
-    // We must reference GPURuntime in such a way that compilers will not
-    // delete it all as dead code, even with whole program optimization,
-    // yet is effectively a NO-OP. As the compiler isn't smart enough
-    // to know that getenv() never returns -1, this will do the job.
-    polly_initContextCL();
-    polly_initContextCUDA();
-    polly_getKernel(nullptr, nullptr);
-    polly_freeKernel(nullptr);
-    polly_copyFromHostToDevice(nullptr, nullptr, 0);
-    polly_copyFromDeviceToHost(nullptr, nullptr, 0);
-    polly_synchronizeDevice();
-    polly_launchKernel(nullptr, 0, 0, 0, 0, 0, nullptr);
-    polly_freeDeviceMemory(nullptr);
-    polly_freeContext(nullptr);
-    polly_synchronizeDevice();
-  }
-} structure;
-} // namespace polly
-#endif
--- a/polly/lib/CMakeLists.txt
+++ b/polly/lib/CMakeLists.txt
@ -6,13 +6,6 @@ set(ISL_CODEGEN_FILES
    CodeGen/IslNodeBuilder.cpp
    CodeGen/CodeGeneration.cpp)

-if (GPU_CODEGEN)
-  set (GPGPU_CODEGEN_FILES
-       CodeGen/PPCGCodeGeneration.cpp
-       CodeGen/ManagedMemoryRewrite.cpp
-       )
-endif (GPU_CODEGEN)
-
 # Compile ISL into a separate library.
 add_subdirectory(External)

@ -44,12 +37,6 @@ set(POLLY_COMPONENTS
    Vectorize
 )

-# Polly-ACC requires the NVPTX backend to work. Ask LLVM about its libraries.
-if (GPU_CODEGEN)
-  # This call emits an error if they NVPTX backend is not enable.
-  list(APPEND POLLY_COMPONENTS NVPTX)
-endif ()
-
 # Use an object-library to add the same files to multiple libs without requiring
 # the sources them to be recompiled for each of them.
 add_llvm_pass_plugin(Polly
@ -73,7 +60,6 @@ add_llvm_pass_plugin(Polly
  CodeGen/Utils.cpp
  CodeGen/RuntimeDebugBuilder.cpp
  CodeGen/PerfMonitor.cpp
-  ${GPGPU_CODEGEN_FILES}
  Exchange/JSONExporter.cpp
  Support/GICHelper.cpp
  Support/SCEVAffinator.cpp
@ -127,16 +113,6 @@ target_link_libraries(Polly PUBLIC
  ${ISL_TARGET}
 )

-# Additional dependencies for Polly-ACC.
-if (GPU_CODEGEN)
-  target_link_libraries(Polly PUBLIC PollyPPCG)
-endif ()
-
-if (NOT LLVM_LINK_LLVM_DYLIB AND NOT LLVM_POLLY_LINK_INTO_TOOLS)
-    # Polly-ACC requires the NVPTX target to be present in the executable it is linked to
-    set_property(TARGET bugpoint APPEND PROPERTY LINK_LIBRARIES LLVMTarget)
-endif ()
-
 # Create a loadable module Polly.so that can be loaded using
 # LLVM's/clang's "-load" option.
 if (WIN32 OR NOT LLVM_ENABLE_PIC)
@ -150,19 +126,6 @@ else ()
    $<TARGET_OBJECTS:obj.Polly>
  )

-  # Only add the dependencies that are not part of LLVM. The latter are assumed
-  # to be already available in the address space the module is loaded into.
-  # Adding them once more would have the effect that both copies try to register
-  # the same command line options, to which LLVM reacts with an error.
-  # If Polly-ACC is enabled, the NVPTX target is also expected to reside in the
-  # hosts. This is not the case for bugpoint. Use LLVM_POLLY_LINK_INTO_TOOLS=ON
-  # instead which will automatically resolve the additional dependencies by
-  # Polly.
-  target_link_libraries(LLVMPolly PUBLIC ${ISL_TARGET})
-  if (GPU_CODEGEN)
-    target_link_libraries(LLVMPolly PUBLIC PollyPPCG)
-  endif ()
-
  set_target_properties(LLVMPolly
    PROPERTIES
    LINKER_LANGUAGE CXX
--- a/polly/lib/CodeGen/BlockGenerators.cpp
+++ b/polly/lib/CodeGen/BlockGenerators.cpp
@ -238,14 +238,8 @@ void BlockGenerator::copyInstScalar(ScopStmt &Stmt, Instruction *Inst,
  Builder.Insert(NewInst);
  BBMap[Inst] = NewInst;

-  // When copying the instruction onto the Module meant for the GPU,
-  // debug metadata attached to an instruction causes all related
-  // metadata to be pulled into the Module. This includes the DICompileUnit,
-  // which will not be listed in llvm.dbg.cu of the Module since the Module
-  // doesn't contain one. This fails the verification of the Module and the
-  // subsequent generation of the ASM string.
-  if (NewInst->getModule() != Inst->getModule())
-    NewInst->setDebugLoc(llvm::DebugLoc());
+  assert(NewInst->getModule() == Inst->getModule() &&
+         "Expecting instructions to be in the same module");

  if (!NewInst->getType()->isVoidTy())
    NewInst->setName("p_" + Inst->getName());
--- a/polly/lib/CodeGen/CodeGeneration.cpp
+++ b/polly/lib/CodeGen/CodeGeneration.cpp
@ -323,10 +323,6 @@ public:

  /// Generate LLVM-IR for the SCoP @p S.
  bool runOnScop(Scop &S) override {
-    // Skip SCoPs in case they're already code-generated by PPCGCodeGeneration.
-    if (S.isToBeSkipped())
-      return false;
-
    AI = &getAnalysis<IslAstInfoWrapperPass>().getAI();
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
--- a/polly/lib/CodeGen/IslAst.cpp
+++ b/polly/lib/CodeGen/IslAst.cpp
@ -638,10 +638,6 @@ isl::ast_build IslAstInfo::getBuild(const isl::ast_node &Node) {
 static std::unique_ptr<IslAstInfo> runIslAst(
    Scop &Scop,
    function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps) {
-  // Skip SCoPs in case they're already handled by PPCGCodeGeneration.
-  if (Scop.isToBeSkipped())
-    return {};
-
  ScopsProcessed++;

  const Dependences &D = GetDeps(Dependences::AL_Statement);
--- a/polly/lib/CodeGen/ManagedMemoryRewrite.cpp
+++ b/polly/lib/CodeGen/ManagedMemoryRewrite.cpp
@ -1,427 +0,0 @@
-//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Take a module and rewrite:
-// 1. `malloc` -> `polly_mallocManaged`
-// 2. `free` -> `polly_freeManaged`
-// 3. global arrays with initializers -> global arrays that are initialized
-//                                       with a constructor call to
-//                                       `polly_mallocManaged`.
-//
-//===----------------------------------------------------------------------===//
-
-#include "polly/CodeGen/IRBuilder.h"
-#include "polly/CodeGen/PPCGCodeGeneration.h"
-#include "polly/DependenceInfo.h"
-#include "polly/LinkAllPasses.h"
-#include "polly/Options.h"
-#include "polly/ScopDetection.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-using namespace polly;
-
-static cl::opt<bool> RewriteAllocas(
-    "polly-acc-rewrite-allocas",
-    cl::desc(
-        "Ask the managed memory rewriter to also rewrite alloca instructions"),
-    cl::Hidden, cl::cat(PollyCategory));
-
-static cl::opt<bool> IgnoreLinkageForGlobals(
-    "polly-acc-rewrite-ignore-linkage-for-globals",
-    cl::desc(
-        "By default, we only rewrite globals with internal linkage. This flag "
-        "enables rewriting of globals regardless of linkage"),
-    cl::Hidden, cl::cat(PollyCategory));
-
-#define DEBUG_TYPE "polly-acc-rewrite-managed-memory"
-namespace {
-
-static llvm::Function *getOrCreatePollyMallocManaged(Module &M) {
-  const char *Name = "polly_mallocManaged";
-  Function *F = M.getFunction(Name);
-
-  // If F is not available, declare it.
-  if (!F) {
-    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-    PollyIRBuilder Builder(M.getContext());
-    // TODO: How do I get `size_t`? I assume from DataLayout?
-    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(),
-                                         {Builder.getInt64Ty()}, false);
-    F = Function::Create(Ty, Linkage, Name, &M);
-  }
-
-  return F;
-}
-
-static llvm::Function *getOrCreatePollyFreeManaged(Module &M) {
-  const char *Name = "polly_freeManaged";
-  Function *F = M.getFunction(Name);
-
-  // If F is not available, declare it.
-  if (!F) {
-    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-    PollyIRBuilder Builder(M.getContext());
-    // TODO: How do I get `size_t`? I assume from DataLayout?
-    FunctionType *Ty =
-        FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false);
-    F = Function::Create(Ty, Linkage, Name, &M);
-  }
-
-  return F;
-}
-
-// Expand a constant expression `Cur`, which is used at instruction `Parent`
-// at index `index`.
-// Since a constant expression can expand to multiple instructions, store all
-// the expands into a set called `Expands`.
-// Note that this goes inorder on the constant expression tree.
-// A * ((B * D) + C)
-// will be processed with first A, then B * D, then B, then D, and then C.
-// Though ConstantExprs are not treated as "trees" but as DAGs, since you can
-// have something like this:
-//    *
-//   /  \
-//   \  /
-//    (D)
-//
-// For the purposes of this expansion, we expand the two occurences of D
-// separately. Therefore, we expand the DAG into the tree:
-//  *
-// / \
-// D  D
-// TODO: We don't _have_to do this, but this is the simplest solution.
-// We can write a solution that keeps track of which constants have been
-// already expanded.
-static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder,
-                               Instruction *Parent, int index,
-                               SmallPtrSet<Instruction *, 4> &Expands) {
-  assert(Cur && "invalid constant expression passed");
-  Instruction *I = Cur->getAsInstruction();
-  assert(I && "unable to convert ConstantExpr to Instruction");
-
-  LLVM_DEBUG(dbgs() << "Expanding ConstantExpression: (" << *Cur
-                    << ") in Instruction: (" << *I << ")\n";);
-
-  // Invalidate `Cur` so that no one after this point uses `Cur`. Rather,
-  // they should mutate `I`.
-  Cur = nullptr;
-
-  Expands.insert(I);
-  Parent->setOperand(index, I);
-
-  // The things that `Parent` uses (its operands) should be created
-  // before `Parent`.
-  Builder.SetInsertPoint(Parent);
-  Builder.Insert(I);
-
-  for (unsigned i = 0; i < I->getNumOperands(); i++) {
-    Value *Op = I->getOperand(i);
-    assert(isa<Constant>(Op) && "constant must have a constant operand");
-
-    if (ConstantExpr *CExprOp = dyn_cast<ConstantExpr>(Op))
-      expandConstantExpr(CExprOp, Builder, I, i, Expands);
-  }
-}
-
-// Edit all uses of `OldVal` to NewVal` in `Inst`. This will rewrite
-// `ConstantExpr`s that are used in the `Inst`.
-// Note that `replaceAllUsesWith` is insufficient for this purpose because it
-// does not rewrite values in `ConstantExpr`s.
-static void rewriteOldValToNew(Instruction *Inst, Value *OldVal, Value *NewVal,
-                               PollyIRBuilder &Builder) {
-
-  // This contains a set of instructions in which OldVal must be replaced.
-  // We start with `Inst`, and we fill it up with the expanded `ConstantExpr`s
-  // from `Inst`s arguments.
-  // We need to go through this process because `replaceAllUsesWith` does not
-  // actually edit `ConstantExpr`s.
-  SmallPtrSet<Instruction *, 4> InstsToVisit = {Inst};
-
-  // Expand all `ConstantExpr`s and place it in `InstsToVisit`.
-  for (unsigned i = 0; i < Inst->getNumOperands(); i++) {
-    Value *Operand = Inst->getOperand(i);
-    if (ConstantExpr *ValueConstExpr = dyn_cast<ConstantExpr>(Operand))
-      expandConstantExpr(ValueConstExpr, Builder, Inst, i, InstsToVisit);
-  }
-
-  // Now visit each instruction and use `replaceUsesOfWith`. We know that
-  // will work because `I` cannot have any `ConstantExpr` within it.
-  for (Instruction *I : InstsToVisit)
-    I->replaceUsesOfWith(OldVal, NewVal);
-}
-
-// Given a value `Current`, return all Instructions that may contain `Current`
-// in an expression.
-// We need this auxiliary function, because if we have a
-// `Constant` that is a user of `V`, we need to recurse into the
-// `Constant`s uses to gather the root instruction.
-static void getInstructionUsersOfValue(Value *V,
-                                       SmallVector<Instruction *, 4> &Owners) {
-  if (auto *I = dyn_cast<Instruction>(V)) {
-    Owners.push_back(I);
-  } else {
-    // Anything that is a `User` must be a constant or an instruction.
-    auto *C = cast<Constant>(V);
-    for (Use &CUse : C->uses())
-      getInstructionUsersOfValue(CUse.getUser(), Owners);
-  }
-}
-
-static void
-replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array,
-                   SmallPtrSet<GlobalVariable *, 4> &ReplacedGlobals) {
-  // We only want arrays.
-  ArrayType *ArrayTy = dyn_cast<ArrayType>(Array.getValueType());
-  if (!ArrayTy)
-    return;
-  Type *ElemTy = ArrayTy->getElementType();
-  PointerType *ElemPtrTy = ElemTy->getPointerTo();
-
-  // We only wish to replace arrays that are visible in the module they
-  // inhabit. Otherwise, our type edit from [T] to T* would be illegal across
-  // modules.
-  const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() ||
-                                       Array.hasInternalLinkage() ||
-                                       IgnoreLinkageForGlobals;
-  if (!OnlyVisibleInsideModule) {
-    LLVM_DEBUG(
-        dbgs() << "Not rewriting (" << Array
-               << ") to managed memory "
-                  "because it could be visible externally. To force rewrite, "
-                  "use -polly-acc-rewrite-ignore-linkage-for-globals.\n");
-    return;
-  }
-
-  if (!Array.hasInitializer() ||
-      !isa<ConstantAggregateZero>(Array.getInitializer())) {
-    LLVM_DEBUG(dbgs() << "Not rewriting (" << Array
-                      << ") to managed memory "
-                         "because it has an initializer which is "
-                         "not a zeroinitializer.\n");
-    return;
-  }
-
-  // At this point, we have committed to replacing this array.
-  ReplacedGlobals.insert(&Array);
-
-  std::string NewName = Array.getName().str();
-  NewName += ".toptr";
-  GlobalVariable *ReplacementToArr =
-      cast<GlobalVariable>(M.getOrInsertGlobal(NewName, ElemPtrTy));
-  ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy));
-
-  Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
-  std::string FnName = Array.getName().str();
-  FnName += ".constructor";
-  PollyIRBuilder Builder(M.getContext());
-  FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
-  const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-  Function *F = Function::Create(Ty, Linkage, FnName, &M);
-  BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F);
-  Builder.SetInsertPoint(Start);
-
-  const uint64_t ArraySizeInt = DL.getTypeAllocSize(ArrayTy);
-  Value *ArraySize = Builder.getInt64(ArraySizeInt);
-  ArraySize->setName("array.size");
-
-  Value *AllocatedMemRaw =
-      Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw");
-  Value *AllocatedMemTyped =
-      Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed");
-  Builder.CreateStore(AllocatedMemTyped, ReplacementToArr);
-  Builder.CreateRetVoid();
-
-  const int Priority = 0;
-  appendToGlobalCtors(M, F, Priority, ReplacementToArr);
-
-  SmallVector<Instruction *, 4> ArrayUserInstructions;
-  // Get all instructions that use array. We need to do this weird thing
-  // because `Constant`s that contain this array neeed to be expanded into
-  // instructions so that we can replace their parameters. `Constant`s cannot
-  // be edited easily, so we choose to convert all `Constant`s to
-  // `Instruction`s and handle all of the uses of `Array` uniformly.
-  for (Use &ArrayUse : Array.uses())
-    getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions);
-
-  for (Instruction *UserOfArrayInst : ArrayUserInstructions) {
-
-    Builder.SetInsertPoint(UserOfArrayInst);
-    // <ty>** -> <ty>*
-    Value *ArrPtrLoaded =
-        Builder.CreateLoad(ElemPtrTy, ReplacementToArr, "arrptr.load");
-    // <ty>* -> [ty]*
-    Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast(
-        ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast");
-    rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder);
-  }
-}
-
-// We return all `allocas` that may need to be converted to a call to
-// cudaMallocManaged.
-static void getAllocasToBeManaged(Function &F,
-                                  SmallSet<AllocaInst *, 4> &Allocas) {
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      auto *Alloca = dyn_cast<AllocaInst>(&I);
-      if (!Alloca)
-        continue;
-      LLVM_DEBUG(dbgs() << "Checking if (" << *Alloca << ") may be captured: ");
-
-      if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false,
-                               /* StoreCaptures */ true)) {
-        Allocas.insert(Alloca);
-        LLVM_DEBUG(dbgs() << "YES (captured).\n");
-      } else {
-        LLVM_DEBUG(dbgs() << "NO (not captured).\n");
-      }
-    }
-  }
-}
-// Replace `Alloca` with managed memory that is freed before every return.
-static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca,
-                                         const DataLayout &DL) {
-  LLVM_DEBUG(dbgs() << "rewriting: (" << *Alloca << ") to managed mem.\n");
-  Module *M = Alloca->getModule();
-  assert(M && "Alloca does not have a module");
-  // New instructions are emitted right before the alloca being replaced.
-  PollyIRBuilder Builder(M->getContext());
-  Builder.SetInsertPoint(Alloca);
-  // NOTE(review): size uses the allocated type only, not the array size.
-  Function *MallocManagedFn =
-      getOrCreatePollyMallocManaged(*Alloca->getModule());
-  const uint64_t Size = DL.getTypeAllocSize(Alloca->getAllocatedType());
-  Value *SizeVal = Builder.getInt64(Size);
-  Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal});
-  Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType());
-
-  Function *F = Alloca->getFunction();
-  assert(F && "Alloca has invalid function");
-  // The cast result inherits the alloca's name and uses; the alloca is erased.
-  Bitcasted->takeName(Alloca);
-  Alloca->replaceAllUsesWith(Bitcasted);
-  Alloca->eraseFromParent();
-  // Call the free helper on the raw managed pointer before each return in F.
-  for (BasicBlock &BB : *F) {
-    ReturnInst *Return = dyn_cast<ReturnInst>(BB.getTerminator());
-    if (!Return)
-      continue;
-    Builder.SetInsertPoint(Return);
-
-    Function *FreeManagedFn = getOrCreatePollyFreeManaged(*M);
-    Builder.CreateCall(FreeManagedFn, {RawManagedMem});
-  }
-}
-
-// Replace all uses of `Old` with `New`, even inside `ConstantExpr`.
-//
-// `replaceAllUsesWith` does not replace values in `ConstantExpr`. This function
-// actually does replace them in `ConstantExpr`. The caveat is that if there is
-// a use that is *outside* a function (say, at global declarations), we fail.
-// So, this is meant to be used on values which we know will only be used
-// within functions.
-//
-// This process works by looking through the uses of `Old`. If it finds a
-// `ConstantExpr`, it recursively looks for the owning instruction.
-// Then, it expands all the `ConstantExpr` to instructions and replaces
-// `Old` with `New` in the expanded instructions.
-static void replaceAllUsesAndConstantUses(Value *Old, Value *New,
-                                          PollyIRBuilder &Builder) {
-  SmallVector<Instruction *, 4> UserInstructions;
-  // Get all instructions that use `Old`. We need to do this weird thing
-  // because `Constant`s that contain `Old` need to be expanded into
-  // instructions so that we can replace their operands. `Constant`s cannot
-  // be edited easily, so we choose to convert all `Constant`s to
-  // `Instruction`s and handle all of the uses of `Old` uniformly.
-  for (Use &ArrayUse : Old->uses())
-    getInstructionUsersOfValue(ArrayUse.getUser(), UserInstructions);
-
-  for (Instruction *I : UserInstructions)
-    rewriteOldValToNew(I, Old, New, Builder);
-}
-// Module pass: rewrites malloc/free, global arrays and (on request) allocas.
-class ManagedMemoryRewritePass final : public ModulePass {
-public:
-  static char ID;
-  GPUArch Architecture;
-  GPURuntime Runtime;
-
-  ManagedMemoryRewritePass() : ModulePass(ID) {}
-  bool runOnModule(Module &M) override {
-    const DataLayout &DL = M.getDataLayout();
-    // Redirect every use of `malloc` to the managed variant, then drop `malloc`.
-    Function *Malloc = M.getFunction("malloc");
-
-    if (Malloc) {
-      PollyIRBuilder Builder(M.getContext());
-      Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
-      assert(PollyMallocManaged && "unable to create polly_mallocManaged");
-
-      replaceAllUsesAndConstantUses(Malloc, PollyMallocManaged, Builder);
-      Malloc->eraseFromParent();
-    }
-    // Same treatment for `free`.
-    Function *Free = M.getFunction("free");
-
-    if (Free) {
-      PollyIRBuilder Builder(M.getContext());
-      Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M);
-      assert(PollyFreeManaged && "unable to create polly_freeManaged");
-
-      replaceAllUsesAndConstantUses(Free, PollyFreeManaged, Builder);
-      Free->eraseFromParent();
-    }
-    // Erasure of rewritten globals is deferred until the loop over them ends.
-    SmallPtrSet<GlobalVariable *, 4> GlobalsToErase;
-    for (GlobalVariable &Global : M.globals())
-      replaceGlobalArray(M, DL, Global, GlobalsToErase);
-    for (GlobalVariable *G : GlobalsToErase)
-      G->eraseFromParent();
-
-    // Rewrite allocas to cudaMallocs if we are asked to do so.
-    if (RewriteAllocas) {
-      SmallSet<AllocaInst *, 4> AllocasToBeManaged;
-      for (Function &F : M.functions())
-        getAllocasToBeManaged(F, AllocasToBeManaged);
-
-      for (AllocaInst *Alloca : AllocasToBeManaged)
-        rewriteAllocaAsManagedMemory(Alloca, DL);
-    }
-    // A change is reported unconditionally, even when nothing was rewritten.
-    return true;
-  }
-};
-} // namespace
-char ManagedMemoryRewritePass::ID = 42;
-// Create the pass and record the requested architecture and runtime on it.
-Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch,
-                                                GPURuntime Runtime) {
-  ManagedMemoryRewritePass *pass = new ManagedMemoryRewritePass();
-  pass->Runtime = Runtime;
-  pass->Architecture = Arch;
-  return pass;
-}
-// Pass registration macros naming the pass and the passes it depends on.
-INITIALIZE_PASS_BEGIN(
-    ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
-    "Polly - Rewrite all allocations in heap & data section to managed memory",
-    false, false)
-INITIALIZE_PASS_DEPENDENCY(PPCGCodeGeneration);
-INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
-INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
-INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
-INITIALIZE_PASS_END(
-    ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
-    "Polly - Rewrite all allocations in heap & data section to managed memory",
-    false, false)
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
--- a/polly/lib/CodeGen/RuntimeDebugBuilder.cpp
+++ b/polly/lib/CodeGen/RuntimeDebugBuilder.cpp
@ -9,7 +9,6 @@
 //===----------------------------------------------------------------------===//

 #include "polly/CodeGen/RuntimeDebugBuilder.h"
-#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/Module.h"
 #include <string>
 #include <vector>
@ -17,6 +16,16 @@
 using namespace llvm;
 using namespace polly;

+llvm::Value *RuntimeDebugBuilder::getPrintableString(PollyIRBuilder &Builder,
+                                                     llvm::StringRef Str) {
+  // FIXME: addressspace(4) is a marker for a string (for the %s conversion
+  // specifier) but should be using the default address space. This only works
+  // because CPU backends typically ignore the address space. For constant
+  // strings as returned by getPrintableString, the format string should instead
+  // directly spell out the string.
+  return Builder.CreateGlobalStringPtr(Str, "", 4); // unnamed global, AS 4
+}
+
 Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  const char *Name = "vprintf";
@ -33,72 +42,9 @@ Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) {
  return F;
 }

-Function *RuntimeDebugBuilder::getAddressSpaceCast(PollyIRBuilder &Builder,
-                                                   unsigned Src, unsigned Dst,
-                                                   unsigned SrcBits,
-                                                   unsigned DstBits) {
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  auto Name = std::string("llvm.nvvm.ptr.constant.to.gen.p") +
-              std::to_string(Dst) + "i" + std::to_string(DstBits) + ".p" +
-              std::to_string(Src) + "i" + std::to_string(SrcBits);
-  Function *F = M->getFunction(Name);
-  // Reuse a function of that name if the module has one; else declare it.
-  if (!F) {
-    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-    FunctionType *Ty = FunctionType::get(
-        PointerType::get(Builder.getIntNTy(DstBits), Dst),
-        PointerType::get(Builder.getIntNTy(SrcBits), Src), false);
-    F = Function::Create(Ty, Linkage, Name, M);
-  }
-
-  return F;
-}
-// Build the values printed as "> block-id: x y z | thread-id: x y z ".
-std::vector<Value *>
-RuntimeDebugBuilder::getGPUThreadIdentifiers(PollyIRBuilder &Builder) {
-  std::vector<Value *> Identifiers;
-
-  auto M = Builder.GetInsertBlock()->getParent()->getParent();
-
-  std::vector<Function *> BlockIDs = {
-      Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_x),
-      Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_y),
-      Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_ctaid_z),
-  };
-  // Each id is cast to i64 as unsigned and followed by a " " string.
-  Identifiers.push_back(Builder.CreateGlobalStringPtr("> block-id: ", "", 4));
-  for (auto GetID : BlockIDs) {
-    Value *Id = Builder.CreateCall(GetID, {});
-    Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false);
-    Identifiers.push_back(Id);
-    Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4));
-  }
-
-  Identifiers.push_back(Builder.CreateGlobalStringPtr("| ", "", 4));
-
-  std::vector<Function *> ThreadIDs = {
-      Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_x),
-      Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_y),
-      Intrinsic::getDeclaration(M, Intrinsic::nvvm_read_ptx_sreg_tid_z),
-  };
-  // The thread ids are emitted the same way as the block ids above.
-  Identifiers.push_back(Builder.CreateGlobalStringPtr("thread-id: ", "", 4));
-  for (auto GetId : ThreadIDs) {
-    Value *Id = Builder.CreateCall(GetId, {});
-    Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false);
-    Identifiers.push_back(Id);
-    Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4));
-  }
-
-  return Identifiers;
-}
-
-void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, bool IsGPU,
+void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder,
                                        ArrayRef<Value *> Values) {
-  if (IsGPU)
-    createGPUPrinterT(Builder, Values);
-  else
-    createCPUPrinterT(Builder, Values);
+  createCPUPrinterT(Builder, Values);
 }

 bool RuntimeDebugBuilder::isPrintable(Type *Ty) {
@ -169,78 +115,6 @@ void RuntimeDebugBuilder::createCPUPrinterT(PollyIRBuilder &Builder,
  createFlush(Builder);
 }

-void RuntimeDebugBuilder::createGPUPrinterT(PollyIRBuilder &Builder,
-                                            ArrayRef<Value *> Values) {
-  std::string str;
-  // `str` accumulates the vprintf format string, one specifier per value.
-  auto *Zero = Builder.getInt64(0);
-  // Thread identifiers come first, then a line break, then the user values.
-  auto ToPrint = getGPUThreadIdentifiers(Builder);
-
-  ToPrint.push_back(Builder.CreateGlobalStringPtr("\n  ", "", 4));
-  ToPrint.insert(ToPrint.end(), Values.begin(), Values.end());
-
-  const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
-
-  // Allocate print buffer (assuming 2*32 bit per element)
-  auto T = ArrayType::get(Builder.getInt32Ty(), ToPrint.size() * 2);
-  Value *Data = new AllocaInst(
-      T, DL.getAllocaAddrSpace(), "polly.vprint.buffer",
-      &Builder.GetInsertBlock()->getParent()->getEntryBlock().front());
-  auto *DataPtr = Builder.CreateGEP(T, Data, {Zero, Zero});
-  // Every value gets a two-i32 slot: slot k starts at i32 index 2 * k.
-  int Offset = 0;
-  for (auto Val : ToPrint) {
-    auto Ptr = Builder.CreateGEP(Builder.getInt32Ty(), DataPtr,
-                                 Builder.getInt64(Offset));
-    Type *Ty = Val->getType();
-
-    if (Ty->isFloatingPointTy()) {
-      if (!Ty->isDoubleTy())
-        Val = Builder.CreateFPExt(Val, Builder.getDoubleTy());
-    } else if (Ty->isIntegerTy()) {
-      if (Ty->getIntegerBitWidth() < 64) {
-        Val = Builder.CreateSExt(Val, Builder.getInt64Ty());
-      } else {
-        assert(Ty->getIntegerBitWidth() == 64 &&
-               "Integer types larger 64 bit not supported");
-        // fallthrough
-      }
-    } else if (isa<PointerType>(Ty)) {
-      if (Ty == Builder.getInt8PtrTy(4)) {
-        // Pointers in constant address space are printed as strings
-        Val = Builder.CreateGEP(Builder.getInt8Ty(), Val, Builder.getInt64(0));
-        auto F = RuntimeDebugBuilder::getAddressSpaceCast(Builder, 4, 0);
-        Val = Builder.CreateCall(F, Val);
-      } else {
-        Val = Builder.CreatePtrToInt(Val, Builder.getInt64Ty());
-      }
-    } else {
-      llvm_unreachable("Unknown type");
-    }
-    // Store the converted value into its slot via an address space 5 pointer.
-    Ty = Val->getType();
-    Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Ty->getPointerTo(5));
-    Builder.CreateAlignedStore(Val, Ptr, Align(4));
-    // Pick the conversion specifier from the type of the converted value.
-    if (Ty->isFloatingPointTy())
-      str += "%f";
-    else if (Ty->isIntegerTy())
-      str += "%ld";
-    else
-      str += "%s";
-
-    Offset += 2;
-  }
-  // The format string is created in address space 4 and cast to space 0.
-  Value *Format = Builder.CreateGlobalStringPtr(str, "polly.vprintf.buffer", 4);
-  Format = Builder.CreateCall(getAddressSpaceCast(Builder, 4, 0), Format);
-
-  Data = Builder.CreateBitCast(Data, Builder.getInt8PtrTy());
-
-  Builder.CreateCall(getVPrintF(Builder), {Format, Data});
-}
-
 Function *RuntimeDebugBuilder::getPrintF(PollyIRBuilder &Builder) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  const char *Name = "printf";
--- a/polly/lib/External/CMakeLists.txt
+++ b/polly/lib/External/CMakeLists.txt
@ -314,91 +314,3 @@ if (POLLY_BUNDLED_ISL)
  target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS})
  target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS})
 endif (POLLY_BUNDLED_ISL)
-
-
-# External: Polyhedral Parallel Code Generator (only when GPU_CODEGEN is set)
-if (GPU_CODEGEN)
-  set(PET_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/pet")
-  set(PPCG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ppcg")
-  set(PPCG_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/ppcg")
-
-  # Determine version of ppcg
-  if (EXISTS "${PPCG_SOURCE_DIR}/GIT_HEAD_ID")
-    # The source comes from a 'make dist' archive
-    file(READ "${PPCG_SOURCE_DIR}/GIT_HEAD_ID" PPCG_GIT_HEAD_ID)
-    string(STRIP "${PPCG_GIT_HEAD_ID}" PPCG_GIT_HEAD_ID)
-  elseif (EXISTS "${PPCG_SOURCE_DIR}/gitversion.h")
-    # The source directory is preconfigured
-    file(READ "${PPCG_SOURCE_DIR}/gitversion.h" GITVERSION_H)
-    string(REGEX REPLACE ".*\\\"([^\\\"]*)\\\".*" "\\1" PPCG_GIT_HEAD_ID "${GITVERSION_H}")
-  else ()
-    # Unknown revision
-    # TODO: We could look for a .git and get the revision from HEAD
-    set(PPCG_GIT_HEAD_ID "UNKNOWN")
-  endif ()
-
-  message(STATUS "PPCG version: ${PPCG_GIT_HEAD_ID}")
-
-  set (PPCG_FILES
-       ppcg/cuda.c
-       ppcg/cuda_common.c
-       ppcg/external.c
-       ppcg/gpu_array_tile.c
-       ppcg/gpu.c
-       ppcg/gpu_array_tile.c  # NOTE(review): listed twice (also two lines up)
-       ppcg/gpu_group.c
-       ppcg/gpu_hybrid.c
-       ppcg/gpu_print.c
-       ppcg/gpu_tree.c
-       ppcg/grouping.c
-       ppcg/hybrid.c
-       ppcg/ppcg.c
-       ppcg/ppcg_options.c
-       ppcg/print.c
-       ppcg/schedule.c
-       ppcg/util.c
-       )
-
-  include_directories(BEFORE
-    ${PPCG_BINARY_DIR}
-    ${PPCG_SOURCE_DIR}/imath
-    ${PPCG_SOURCE_DIR}/include
-    ${PET_SOURCE_DIR}/include
-  )
-
-  add_polly_library(PollyPPCG
-    ${PPCG_FILES}
-  )
-
-  target_link_libraries(PollyPPCG PUBLIC ${ISL_TARGET})
-
-  # Disable warnings for upstream projects.
-  if (MSVC)
-    set(DISABLE_WARNING_FLAGS
-      -wd4018 # 'expression' : signed/unsigned mismatch
-      -wd4090 # 'operation' : different 'modifier' qualifiers
-      -wd4200 # nonstandard extension used: zero-sized array in struct/union
-      -wd4201 # nonstandard extension used: nameless struct/union
-      -wd4334 # 'operator': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
-      -wd4221 # nonstandard extension used : 'identifier' : cannot be initialized using address of automatic variable
-    )
-    if (POLLY_BUNDLED_ISL)
-      target_compile_options(PollyISL PRIVATE ${DISABLE_WARNING_FLAGS})
-      target_compile_options(polly-isl-test PRIVATE ${DISABLE_WARNING_FLAGS})
-    endif (POLLY_BUNDLED_ISL)
-    target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS})
-  else ()
-    if (POLLY_BUNDLED_ISL)
-      set_target_properties(PollyISL polly-isl-test PROPERTIES COMPILE_FLAGS "-w")
-    endif (POLLY_BUNDLED_ISL)
-    set_target_properties(PollyPPCG PROPERTIES COMPILE_FLAGS "-w")
-  endif ()
-
-  if(MSVC)
-    # In the Windows API (with some exceptions), the maximum length for a path is
-    # MAX_PATH, which is defined as 260 characters.
-    target_compile_definitions(PollyPPCG PRIVATE "-DPATH_MAX=260")
-  endif ()
-
-  target_compile_options(PollyPPCG PRIVATE ${DISABLE_WARNING_FLAGS})
-endif ()
--- a/polly/lib/External/pet/include/pet.h
+++ b/polly/lib/External/pet/include/pet.h
@ -1,622 +0,0 @@
-#ifndef PET_H
-#define PET_H
-
-#include <isl/aff.h>
-#include <isl/arg.h>
-#include <isl/ast_build.h>
-#include <isl/set.h>
-#include <isl/map.h>
-#include <isl/union_map.h>
-#include <isl/printer.h>
-#include <isl/id_to_ast_expr.h>
-#include <isl/id_to_pw_aff.h>
-#include <isl/schedule.h>
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-struct pet_options;
-ISL_ARG_DECL(pet_options, struct pet_options, pet_options_args)
-
-/* Create an isl_ctx that references the pet options. */
-isl_ctx *isl_ctx_alloc_with_pet_options();
-
-/* If autodetect is set, any valid scop is extracted.
- * Otherwise, the scop needs to be delimited by pragmas.
- */
-int pet_options_set_autodetect(isl_ctx *ctx, int val);
-int pet_options_get_autodetect(isl_ctx *ctx);
-
-int pet_options_set_detect_conditional_assignment(isl_ctx *ctx, int val);
-int pet_options_get_detect_conditional_assignment(isl_ctx *ctx);
-
-/* If encapsulate-dynamic-control is set, then any dynamic control
- * in the input program will be encapsulated in macro statements.
- * This means in particular that no statements with arguments
- * will be created.
- */
-int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val);
-int pet_options_get_encapsulate_dynamic_control(isl_ctx *ctx);
-
-#define	PET_OVERFLOW_AVOID	0
-#define	PET_OVERFLOW_IGNORE	1
-int pet_options_set_signed_overflow(isl_ctx *ctx, int val);
-int pet_options_get_signed_overflow(isl_ctx *ctx);
-
-struct pet_loc;
-typedef struct pet_loc pet_loc;
-
-/* Return an additional reference to "loc". */
-__isl_give pet_loc *pet_loc_copy(__isl_keep pet_loc *loc);
-/* Free a reference to "loc". */
-pet_loc *pet_loc_free(__isl_take pet_loc *loc);
-
-/* Return the offset in the input file of the start of "loc". */
-unsigned pet_loc_get_start(__isl_keep pet_loc *loc);
-/* Return the offset in the input file of the character after "loc". */
-unsigned pet_loc_get_end(__isl_keep pet_loc *loc);
-/* Return the line number of a line within the "loc" region. */
-int pet_loc_get_line(__isl_keep pet_loc *loc);
-/* Return the indentation of the "loc" region. */
-__isl_keep const char *pet_loc_get_indent(__isl_keep pet_loc *loc);
-
-enum pet_expr_type {
-	pet_expr_error = -1,
-	pet_expr_access,
-	pet_expr_call,
-	pet_expr_cast,
-	pet_expr_int,
-	pet_expr_double,
-	pet_expr_op
-};
-
-enum pet_op_type {
-	/* only compound assignment operators before assignment */
-	pet_op_add_assign,
-	pet_op_sub_assign,
-	pet_op_mul_assign,
-	pet_op_div_assign,
-	pet_op_and_assign,
-	pet_op_xor_assign,
-	pet_op_or_assign,
-	pet_op_assign,
-	pet_op_add,
-	pet_op_sub,
-	pet_op_mul,
-	pet_op_div,
-	pet_op_mod,
-	pet_op_shl,
-	pet_op_shr,
-	pet_op_eq,
-	pet_op_ne,
-	pet_op_le,
-	pet_op_ge,
-	pet_op_lt,
-	pet_op_gt,
-	pet_op_minus,
-	pet_op_post_inc,
-	pet_op_post_dec,
-	pet_op_pre_inc,
-	pet_op_pre_dec,
-	pet_op_address_of,
-	pet_op_assume,
-	pet_op_kill,
-	pet_op_and,
-	pet_op_xor,
-	pet_op_or,
-	pet_op_not,
-	pet_op_land,
-	pet_op_lor,
-	pet_op_lnot,
-	pet_op_cond,
-	pet_op_last
-};
-
-/* Index into the pet_expr->args array when pet_expr->type == pet_expr_unary
- */
-enum pet_un_arg_type {
-	pet_un_arg
-};
-
-/* Indices into the pet_expr->args array when
- * pet_expr->type == pet_expr_binary
- */
-enum pet_bin_arg_type {
-	pet_bin_lhs,
-	pet_bin_rhs
-};
-
-/* Indices into the pet_expr->args array when
- * pet_expr->type == pet_expr_ternary
- */
-enum pet_ter_arg_type {
-	pet_ter_cond,
-	pet_ter_true,
-	pet_ter_false
-};
-
-struct pet_expr;
-typedef struct pet_expr pet_expr;
-
-/* Return an additional reference to "expr". */
-__isl_give pet_expr *pet_expr_copy(__isl_keep pet_expr *expr);
-/* Free a reference to "expr". */
-__isl_null pet_expr *pet_expr_free(__isl_take pet_expr *expr);
-
-/* Return the isl_ctx in which "expr" was created. */
-isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr);
-
-/* Return the type of "expr". */
-enum pet_expr_type pet_expr_get_type(__isl_keep pet_expr *expr);
-/* Return the number of arguments of "expr". */
-int pet_expr_get_n_arg(__isl_keep pet_expr *expr);
-/* Set the number of arguments of "expr" to "n". */
-__isl_give pet_expr *pet_expr_set_n_arg(__isl_take pet_expr *expr, int n);
-/* Return the argument of "expr" at position "pos". */
-__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos);
-/* Replace the argument of "expr" at position "pos" by "arg". */
-__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos,
-	__isl_take pet_expr *arg);
-
-/* Return the operation type of operation expression "expr". */
-enum pet_op_type pet_expr_op_get_type(__isl_keep pet_expr *expr);
-/* Replace the operation type of operation expression "expr" by "type". */
-__isl_give pet_expr *pet_expr_op_set_type(__isl_take pet_expr *expr,
-	enum pet_op_type type);
-
-/* Construct a (read) access pet_expr from an index expression. */
-__isl_give pet_expr *pet_expr_from_index(__isl_take isl_multi_pw_aff *index);
-
-/* Does "expr" represent an affine expression? */
-isl_bool pet_expr_is_affine(__isl_keep pet_expr *expr);
-/* Does the access expression "expr" read the accessed elements? */
-isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr);
-/* Does the access expression "expr" write to the accessed elements? */
-isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr);
-/* Does the access expression "expr" kill the accessed elements? */
-isl_bool pet_expr_access_is_kill(__isl_keep pet_expr *expr);
-/* Mark "expr" as a read depending on "read". */
-__isl_give pet_expr *pet_expr_access_set_read(__isl_take pet_expr *expr,
-	int read);
-/* Mark "expr" as a write depending on "write". */
-__isl_give pet_expr *pet_expr_access_set_write(__isl_take pet_expr *expr,
-	int write);
-/* Mark "expr" as a kill depending on "kill". */
-__isl_give pet_expr *pet_expr_access_set_kill(__isl_take pet_expr *expr,
-	int kill);
-/* Return the reference identifier of access expression "expr". */
-__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr);
-/* Replace the reference identifier of access expression "expr" by "ref_id". */
-__isl_give pet_expr *pet_expr_access_set_ref_id(__isl_take pet_expr *expr,
-	__isl_take isl_id *ref_id);
-/* Return the identifier of the outer array accessed by "expr". */
-__isl_give isl_id *pet_expr_access_get_id(__isl_keep pet_expr *expr);
-/* Return the index expression of access expression "expr". */
-__isl_give isl_multi_pw_aff *pet_expr_access_get_index(
-	__isl_keep pet_expr *expr);
-
-/* Return the potential read access relation of access expression "expr". */
-__isl_give isl_union_map *pet_expr_access_get_may_read(
-	__isl_keep pet_expr *expr);
-/* Return the potential write access relation of access expression "expr". */
-__isl_give isl_union_map *pet_expr_access_get_may_write(
-	__isl_keep pet_expr *expr);
-/* Return the definite write access relation of access expression "expr". */
-__isl_give isl_union_map *pet_expr_access_get_must_write(
-	__isl_keep pet_expr *expr);
-/* Return the argument dependent potential read access relation of "expr". */
-__isl_give isl_union_map *pet_expr_access_get_dependent_may_read(
-	__isl_keep pet_expr *expr);
-/* Return the argument dependent potential write access relation of "expr". */
-__isl_give isl_union_map *pet_expr_access_get_dependent_may_write(
-	__isl_keep pet_expr *expr);
-/* Return the argument dependent definite write access relation of "expr". */
-__isl_give isl_union_map *pet_expr_access_get_dependent_must_write(
-	__isl_keep pet_expr *expr);
-/* Return the tagged potential read access relation of access "expr". */
-__isl_give isl_union_map *pet_expr_access_get_tagged_may_read(
-	__isl_keep pet_expr *expr);
-/* Return the tagged potential write access relation of access "expr". */
-__isl_give isl_union_map *pet_expr_access_get_tagged_may_write(
-	__isl_keep pet_expr *expr);
-
-/* Return the name of the function called by "expr". */
-__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr);
-/* Replace the name of the function called by "expr" by "name". */
-__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr,
-	__isl_keep const char *name);
-
-/* Create a pet_expr representing a cast of "arg" to "type_name". */
-__isl_give pet_expr *pet_expr_new_cast(const char *type_name,
-	__isl_take pet_expr *arg);
-/* Replace the type of the cast performed by "expr" by "name". */
-__isl_give pet_expr *pet_expr_cast_set_type_name(__isl_take pet_expr *expr,
-	__isl_keep const char *name);
-
-/* Return the value of the integer represented by "expr". */
-__isl_give isl_val *pet_expr_int_get_val(__isl_keep pet_expr *expr);
-/* Replace the value of the integer represented by "expr" by "v". */
-__isl_give pet_expr *pet_expr_int_set_val(__isl_take pet_expr *expr,
-	__isl_take isl_val *v);
-
-/* Return a string representation of the double expression "expr". */
-__isl_give char *pet_expr_double_get_str(__isl_keep pet_expr *expr);
-/* Replace value and string representation of the double expression "expr" */
-__isl_give pet_expr *pet_expr_double_set(__isl_take pet_expr *expr,
-	double d, __isl_keep const char *s);
-
-/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_access. */
-int pet_expr_foreach_access_expr(__isl_keep pet_expr *expr,
-	int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
-/* Call "fn" on each of the subexpressions of "expr" of type pet_expr_call. */
-int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr,
-	int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
-
-struct pet_context;
-typedef struct pet_context pet_context;
-
-/* Create a context with the given domain. */
-__isl_give pet_context *pet_context_alloc(__isl_take isl_set *domain);
-/* Return an additional reference to "pc". */
-__isl_give pet_context *pet_context_copy(__isl_keep pet_context *pc);
-/* Free a reference to "pc". */
-__isl_null pet_context *pet_context_free(__isl_take pet_context *pc);
-
-/* Return the isl_ctx in which "pc" was created. */
-isl_ctx *pet_context_get_ctx(__isl_keep pet_context *pc);
-
-/* Extract an affine expression defined over the domain of "pc" from "expr"
- * or return NaN.
- */
-__isl_give isl_pw_aff *pet_expr_extract_affine(__isl_keep pet_expr *expr,
-	__isl_keep pet_context *pc);
-
-void pet_expr_dump(__isl_keep pet_expr *expr);
-
-enum pet_tree_type {
-	pet_tree_error = -1,
-	pet_tree_expr,
-	pet_tree_block,
-	pet_tree_break,
-	pet_tree_continue,
-	pet_tree_decl,		/* A declaration without initialization */
-	pet_tree_decl_init,	/* A declaration with initialization */
-	pet_tree_if,		/* An if without an else branch */
-	pet_tree_if_else,	/* An if with an else branch */
-	pet_tree_for,
-	pet_tree_infinite_loop,
-	pet_tree_while,
-	pet_tree_return,
-};
-
-struct pet_tree;
-typedef struct pet_tree pet_tree;
-
-/* Return the isl_ctx in which "tree" was created. */
-isl_ctx *pet_tree_get_ctx(__isl_keep pet_tree *tree);
-
-/* Return an additional reference to "tree". */
-__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree);
-/* Free a reference to "tree". */
-__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree);
-
-/* Return the location of "tree". */
-__isl_give pet_loc *pet_tree_get_loc(__isl_keep pet_tree *tree);
-
-/* Return the type of "tree". */
-enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree);
-
-/* Return the expression of the expression tree "tree". */
-__isl_give pet_expr *pet_tree_expr_get_expr(__isl_keep pet_tree *tree);
-
-/* Return the expression returned by the return tree "tree". */
-__isl_give pet_expr *pet_tree_return_get_expr(__isl_keep pet_tree *tree);
-
-/* Return the number of children of the block tree "tree". */
-int pet_tree_block_n_child(__isl_keep pet_tree *tree);
-/* Return child "pos" of the block tree "tree". */
-__isl_give pet_tree *pet_tree_block_get_child(__isl_keep pet_tree *tree,
-	int pos);
-
-/* Is "tree" a declaration (with or without initialization)? */
-int pet_tree_is_decl(__isl_keep pet_tree *tree);
-/* Return the variable declared by the declaration tree "tree". */
-__isl_give pet_expr *pet_tree_decl_get_var(__isl_keep pet_tree *tree);
-/* Return the initial value of the pet_tree_decl_init tree "tree". */
-__isl_give pet_expr *pet_tree_decl_get_init(__isl_keep pet_tree *tree);
-
-/* Return the condition of the if tree "tree". */
-__isl_give pet_expr *pet_tree_if_get_cond(__isl_keep pet_tree *tree);
-/* Return the then branch of the if tree "tree". */
-__isl_give pet_tree *pet_tree_if_get_then(__isl_keep pet_tree *tree);
-/* Return the else branch of the if tree with else branch "tree". */
-__isl_give pet_tree *pet_tree_if_get_else(__isl_keep pet_tree *tree);
-
-/* Is "tree" a for loop, a while loop or an infinite loop? */
-int pet_tree_is_loop(__isl_keep pet_tree *tree);
-/* Return the induction variable of the for loop "tree" */
-__isl_give pet_expr *pet_tree_loop_get_var(__isl_keep pet_tree *tree);
-/* Return the initial value of the induction variable of the for loop "tree" */
-__isl_give pet_expr *pet_tree_loop_get_init(__isl_keep pet_tree *tree);
-/* Return the condition of the loop tree "tree" */
-__isl_give pet_expr *pet_tree_loop_get_cond(__isl_keep pet_tree *tree);
-/* Return the increment of the for loop "tree" */
-__isl_give pet_expr *pet_tree_loop_get_inc(__isl_keep pet_tree *tree);
-/* Return the body of the loop tree "tree" */
-__isl_give pet_tree *pet_tree_loop_get_body(__isl_keep pet_tree *tree);
-
-/* Call "fn" on each top-level expression in the nodes of "tree" */
-int pet_tree_foreach_expr(__isl_keep pet_tree *tree,
-	int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
-/* Call "fn" on each access subexpression in the nodes of "tree" */
-int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree,
-	int (*fn)(__isl_keep pet_expr *expr, void *user), void *user);
-/* Modify all call subexpressions in the nodes of "tree" through "fn". */
-__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree,
-	__isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user),
-	void *user);
-
-void pet_tree_dump(__isl_keep pet_tree *tree);
-
-/* "loc" represents the region of the source code that is represented
- * by this statement.
- *
- * If the statement has arguments, i.e., n_arg != 0, then
- * "domain" is a wrapped map, mapping the iteration domain
- * to the values of the arguments for which this statement
- * is executed.
- * Otherwise, it is simply the iteration domain.
- *
- * If one of the arguments is an access expression that accesses
- * more than one element for a given iteration, then the constraints
- * on the value of this argument (encoded in "domain") should be satisfied
- * for all of those accessed elements.
- */
-struct pet_stmt {
-	pet_loc *loc;
-	isl_set *domain;
-	pet_tree *body;
-
-	unsigned n_arg;
-	pet_expr **args;
-};
-
-/* Return the iteration space of "stmt". */
-__isl_give isl_space *pet_stmt_get_space(struct pet_stmt *stmt);
-
-/* Is "stmt" an assignment statement? */
-int pet_stmt_is_assign(struct pet_stmt *stmt);
-/* Is "stmt" a kill statement? */
-int pet_stmt_is_kill(struct pet_stmt *stmt);
-
-/* pet_stmt_build_ast_exprs is currently limited to only handle
- * some forms of data dependent accesses.
- * If pet_stmt_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs
- * can safely be called on "stmt".
- */
-int pet_stmt_can_build_ast_exprs(struct pet_stmt *stmt);
-/* Construct an associative array from reference identifiers of
- * access expressions in "stmt" to the corresponding isl_ast_expr.
- * Each index expression is first transformed through "fn_index"
- * (if not NULL).  Then an AST expression is generated using "build".
- * Finally, the AST expression is transformed using "fn_expr"
- * (if not NULL).
- */
-__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt,
-	__isl_keep isl_ast_build *build,
-	__isl_give isl_multi_pw_aff *(*fn_index)(
-		__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id,
-		void *user), void *user_index,
-	__isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr,
-		__isl_keep isl_id *id, void *user), void *user_expr);
-
-/* Print "stmt" to "p".
- *
- * The access expressions in "stmt" are replaced by the isl_ast_expr
- * associated to its reference identifier in "ref2expr".
- */
-__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt,
-	__isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr);
-
-/* This structure represents a defined type.
- * "name" is the name of the type, while "definition" is a string
- * representation of its definition.
- */
-struct pet_type {
-	char *name;
-	char *definition;
-};
-
-/* This structure describes an array of a pet_scop
- * (see the "arrays" field of struct pet_scop).
- *
- * context holds constraints on the parameter that ensure that
- * this array has a valid (i.e., non-negative) size
- *
- * extent holds constraints on the indices
- *
- * value_bounds holds constraints on the elements of the array
- * and may be NULL if no such constraints were specified by the user
- *
- * element_size is the size in bytes of each array element
- * element_type is the type of the array elements.
- * element_is_record is set if this type is a record type.
- *
- * live_out is set if the array appears in a live-out pragma
- *
- * if uniquely_defined is set then the array is written by a single access
- * such that any element that is ever read
- * is known to be assigned exactly once before the read
- *
- * declared is set if the array was declared somewhere inside the scop.
- * exposed is set if the declared array is visible outside the scop.
- * outer is set if the type of the array elements is a record and
- * the fields of this record are represented by separate pet_array structures.
- */
-struct pet_array {
-	isl_set *context;
-	isl_set *extent;
-	isl_set *value_bounds;	/* may be NULL */
-	char *element_type;
-	int element_is_record;
-	int element_size;	/* in bytes */
-	int live_out;
-	int uniquely_defined;
-	int declared;
-	int exposed;
-	int outer;
-};
-
-/* This structure represents an implication on a boolean filter.
- * In particular, if the filter value of an element in the domain
- * of "extension" is equal to "satisfied", then the filter values
- * of the corresponding images in "extension" are also equal
- * to "satisfied".
- */
-struct pet_implication {
-	/* Filter value to which the implication applies. */
-	int satisfied;
-	/* Maps elements to the images that share the filter value. */
-	isl_map *extension;
-};
-
-/* This structure represents an independence implied by a for loop
- * that is marked as independent in the source code.
- * "filter" contains pairs of statement instances that are guaranteed
- * not to be dependent on each other based on the independent for loop,
- * assuming that no dependences carried by this loop are implied
- * by the variables in "local".
- * "local" contains the variables that are local to the loop that was
- * marked independent.
- */
-struct pet_independence {
-	/* Pairs of statement instances guaranteed to be independent. */
-	isl_union_map *filter;
-	/* Variables local to the loop that was marked independent. */
-	isl_union_set *local;
-};
-
-/* "loc" represents the region of the source code that is represented
- * by this scop.
- * If the scop was detected based on scop and endscop pragmas, then
- * the lines containing these pragmas are included in this region.
- * In the final result, the context describes the set of parameter values
- * for which the scop can be executed.
- * During the construction of the pet_scop, the context lives in a set space
- * where each dimension refers to an outer loop.
- * context_value describes assignments to the parameters (if any)
- * outside of the scop.
- *
- * "schedule" is the schedule of the statements in the scop.
- *
- * The n_type types define types that may be referenced by the arrays.
- *
- * The n_array arrays and the n_stmt statements are stored
- * in "arrays" and "stmts".
- *
- * The n_implication implications describe implications on boolean filters.
- *
- * The n_independence independences describe independences implied
- * by for loops that are marked independent in the source code.
- */
-struct pet_scop {
-	pet_loc *loc;
-
-	isl_set *context;
-	isl_set *context_value;
-	isl_schedule *schedule;
-
-	/* Number of elements in "types". */
-	int n_type;
-	struct pet_type **types;
-
-	/* Number of elements in "arrays". */
-	int n_array;
-	struct pet_array **arrays;
-
-	/* Number of elements in "stmts". */
-	int n_stmt;
-	struct pet_stmt **stmts;
-
-	/* Number of elements in "implications". */
-	int n_implication;
-	struct pet_implication **implications;
-
-	/* Number of elements in "independences". */
-	int n_independence;
-	struct pet_independence **independences;
-};
-typedef struct pet_scop pet_scop;
-
-/* Return a textual representation of the operator. */
-const char *pet_op_str(enum pet_op_type op);
-/* NOTE(review): judging by the name, this presumably tests whether "op"
- * is an increment or a decrement operator -- confirm against the definition.
- */
-int pet_op_is_inc_dec(enum pet_op_type op);
-
-/* Extract a pet_scop from a C source file.
- * If function is not NULL, then the pet_scop is extracted from
- * a function with that name.
- */
-__isl_give pet_scop *pet_scop_extract_from_C_source(isl_ctx *ctx,
-	const char *filename, const char *function);
-
-/* Transform the C source file "input" by rewriting each scop
- * using the "transform" callback.
- * When autodetecting scops, at most one scop per function is rewritten.
- * The transformed C code is written to "output".
- */
-int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
-	__isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
-		__isl_take pet_scop *scop, void *user), void *user);
-/* Given a scop and a printer passed to a pet_transform_C_source callback,
- * print the original corresponding code to the printer.
- */
-__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
-	__isl_take isl_printer *p);
-
-/* Update all isl_sets and isl_maps such that they all have the same
- * parameters in the same order.
- */
-__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop);
-
-/* Does "scop" contain any data dependent accesses? */
-int pet_scop_has_data_dependent_accesses(__isl_keep pet_scop *scop);
-/* Does "scop" contain any data dependent conditions? */
-int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop);
-/* pet_stmt_build_ast_exprs is currently limited to only handle
- * some forms of data dependent accesses.
- * If pet_scop_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs
- * can safely be called on all statements in the scop.
- */
-int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop);
-
-/* NOTE(review): presumably prints a representation of "scop"
- * for debugging purposes -- confirm where the output is written.
- */
-void pet_scop_dump(__isl_keep pet_scop *scop);
-/* Free "scop".
- * NOTE(review): the __isl_null annotation suggests that NULL
- * is always returned -- confirm.
- */
-__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop);
-
-/* Return the context of "scop". */
-__isl_give isl_set *pet_scop_get_context(__isl_keep pet_scop *scop);
-/* Return the schedule of "scop". */
-__isl_give isl_schedule *pet_scop_get_schedule(__isl_keep pet_scop *scop);
-/* Return the set of all statement instances. */
-__isl_give isl_union_set *pet_scop_get_instance_set(__isl_keep pet_scop *scop);
-/* Return the potential read access relation. */
-__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop);
-/* Return the tagged potential read access relation. */
-__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
-	__isl_keep pet_scop *scop);
-/* Return the potential write access relation. */
-__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop);
-/* Return the definite write access relation. */
-__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop);
-/* Return the tagged potential write access relation. */
-__isl_give isl_union_map *pet_scop_get_tagged_may_writes(
-	__isl_keep pet_scop *scop);
-/* Return the tagged definite write access relation. */
-__isl_give isl_union_map *pet_scop_get_tagged_must_writes(
-	__isl_keep pet_scop *scop);
-/* Return the definite kill access relation. */
-__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop);
-/* Return the tagged definite kill access relation. */
-__isl_give isl_union_map *pet_scop_get_tagged_must_kills(
-	__isl_keep pet_scop *scop);
-
-/* Compute a mapping from all outermost arrays (of structs) in scop
- * to their innermost members.
- */
-__isl_give isl_union_map *pet_scop_compute_outer_to_inner(
-	__isl_keep pet_scop *scop);
-/* Compute a mapping from all outermost arrays (of structs) in scop
- * to their members, including the outermost arrays themselves.
- */
-__isl_give isl_union_map *pet_scop_compute_outer_to_any(
-	__isl_keep pet_scop *scop);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
--- a/polly/lib/External/ppcg/ChangeLog
+++ b/polly/lib/External/ppcg/ChangeLog
@ -1,29 +0,0 @@
-version: 0.07
-date: Tue Feb  7 17:23:22 CET 2017
-changes:
-	- support hybrid tiling
---
-version: 0.06
-date: Fri May  6 12:08:50 CEST 2016
-changes:
-	- use PPCG specific macro names in generated code
-	- complete transition to schedule trees
-	- maximize coincidence by default
-	- map arrays with constant index expressions to private memory
-	- optionally group chains of statements
---
-version: 0.05
-date: Fri Jan 15 09:30:23 CET 2016
-changes:
-	- fix live-out computation
-	- optionally compute schedule for C target
-	- optionally perform tiling for C target
-	- create single kernel for non-permutable subtree
---
-version: 0.04
-date: Wed Jun 17 10:52:58 CEST 2015
-changes:
-	- use schedule trees
-	- fix live-range reordering
-	- improve generation of synchronization
-	- exploit independences during dependence analysis
--- a/polly/lib/External/ppcg/GIT_HEAD_ID
+++ b/polly/lib/External/ppcg/GIT_HEAD_ID
@ -1 +0,0 @@
-ppcg-0.07
--- a/polly/lib/External/ppcg/README
+++ b/polly/lib/External/ppcg/README
@ -1,246 +0,0 @@
-Requirements:
-
- automake, autoconf, libtool
-	(not needed when compiling a release)
- pkg-config (http://www.freedesktop.org/wiki/Software/pkg-config)
-	(not needed when compiling a release using the included isl and pet)
- gmp (http://gmplib.org/)
- libyaml (http://pyyaml.org/wiki/LibYAML)
-	(only needed if you want to compile the pet executable)
- LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html)
-	Unless you have some other reasons for wanting to use the svn version,
-	it is best to install the latest release (3.9).
-	For more details, see pet/README.
-
-If you are installing on Ubuntu, then you can install the following packages:
-
-automake autoconf libtool pkg-config libgmp3-dev libyaml-dev libclang-dev llvm
-
-Note that you need at least version 3.2 of libclang-dev (Ubuntu Raring).
-Older versions of this package did not include the required libraries.
-If you are using an older version of Ubuntu, then you need to compile and
-install LLVM/clang from source.
-
-
-Preparing:
-
-Grab the latest release and extract it or get the source from
-the git repository as follows.  This process requires autoconf,
-automake, libtool and pkg-config.
-
-	git clone git://repo.or.cz/ppcg.git
-	cd ppcg
-	./get_submodules.sh
-	./autogen.sh
-
-
-Compilation:
-
-	./configure
-	make
-	make check
-
-If you have installed any of the required libraries in a non-standard
-location, then you may need to use the --with-gmp-prefix,
--with-libyaml-prefix and/or --with-clang-prefix options
-when calling "./configure".
-
-
-Using PPCG to generate CUDA or OpenCL code
-
-To convert a fragment of a C program to CUDA, insert a line containing
-
-	#pragma scop
-
-before the fragment and add a line containing
-
-	#pragma endscop
-
-after the fragment.  To generate CUDA code run
-	
-	ppcg --target=cuda file.c
-
-where file.c is the file containing the fragment.  The generated
-code is stored in file_host.cu and file_kernel.cu.
-
-To generate OpenCL code run
-
-	ppcg --target=opencl file.c
-
-where file.c is the file containing the fragment.  The generated code
-is stored in file_host.c and file_kernel.cl.
-
-
-Specifying tile, grid and block sizes
-
-The iteration space tile size, grid size and block size can
-be specified using the --sizes option.  The argument is a union map
-in isl notation mapping kernels identified by their sequence number
-in a "kernel" space to singleton sets in the "tile", "grid" and "block"
-spaces.  The sizes are specified outermost to innermost.
-
-The dimension of the "tile" space indicates the (maximal) number of loop
-dimensions to tile.  The elements of the single integer tuple
-specify the tile sizes in each dimension.
-In case of hybrid tiling, the first element is half the size of
-the tile in the time (sequential) dimension.  The second element
-specifies the number of elements in the base of the hexagon.
-The remaining elements specify the tile sizes in the remaining space
-dimensions.
-
-The dimension of the "grid" space indicates the (maximal) number of block
-dimensions in the grid.  The elements of the single integer tuple
-specify the number of blocks in each dimension.
-
-The dimension of the "block" space indicates the (maximal) number of thread
-dimensions in the block.  The elements of the single integer tuple
-specify the number of threads in each dimension.
-
-For example,
-
-    { kernel[0] -> tile[64,64]; kernel[i] -> block[16] : i != 4 }
-
-specifies that in kernel 0, two loops should be tiled with a tile
-size of 64 in both dimensions and that all kernels except kernel 4
-should be run using a block of 16 threads.
-
-Since PPCG performs some scheduling, it can be difficult to predict
-what exactly will end up in a kernel.  If you want to specify
-tile, grid or block sizes, you may want to run PPCG first with the defaults,
-examine the kernels and then run PPCG again with the desired sizes.
-Instead of examining the kernels, you can also specify the option
--dump-sizes on the first run to obtain the effectively used default sizes.
-
-
-Compiling the generated CUDA code with nvcc
-
-To get optimal performance from nvcc, it is important to choose --arch
-according to your target GPU.  Specifically, use the flag "--arch sm_20"
-for fermi, "--arch sm_30" for GK10x Kepler and "--arch sm_35" for
-GK110 Kepler.  We discourage the use of older cards as we have seen
-correctness issues with compilation for older architectures.
-Note that in the absence of any --arch flag, nvcc defaults to
-"--arch sm_13". This will not only be slower, but can also cause
-correctness issues.
-If you want to obtain results that are identical to those obtained
-by the original code, then you may need to disable some optimizations
-by passing the "--fmad=false" option.
-
-
-Compiling the generated OpenCL code with gcc
-
-To compile the host code you need to link against the file
-ocl_utilities.c which contains utility functions used by the generated
-OpenCL host code.  To compile the host code with gcc, run
-
-  gcc -std=c99 file_host.c ocl_utilities.c -lOpenCL
-
-Note that we have experienced the generated OpenCL code freezing
-on some inputs (e.g., the PolyBench symm benchmark) when using
-at least some versions of the Nvidia OpenCL library, while the
-corresponding CUDA code runs fine.
-We have experienced no such freezes when using AMD, ARM or Intel
-OpenCL libraries.
-
-By default, the compiled executable will need the _kernel.cl file at
-run time.  Alternatively, the option --opencl-embed-kernel-code may be
-given to place the kernel code in a string literal.  The kernel code is
-then compiled into the host binary, such that the _kernel.cl file is no
-longer needed at run time.  Any kernel include files, in particular
-those supplied using --opencl-include-file, will still be required at
-run time.
-
-
-Function calls
-
-Function calls inside the analyzed fragment are reproduced
-in the CUDA or OpenCL code, but for now it is left to the user
-to make sure that the functions that are being called are
-available from the generated kernels.
-
-In the case of OpenCL code, the --opencl-include-file option
-may be used to specify one or more files to be #include'd
-from the generated code.  These files may then contain
-the definitions of the functions being called from the
-program fragment.  If the pathnames of the included files
-are relative to the current directory, then you may need
-to additionally specify the --opencl-compiler-options=-I.
-to make sure that the files can be found by the OpenCL compiler.
-The included files may contain definitions of types used by the
-generated kernels.  By default, PPCG generates definitions for
-types as needed, but these definitions may collide with those in
-the included files, as PPCG does not consider the contents of the
-included files.  The --no-opencl-print-kernel-types will prevent
-PPCG from generating type definitions.
-
-
-GNU extensions
-
-By default, PPCG may print out macro definitions that involve
-GNU extensions such as __typeof__ and statement expressions.
-Some compilers may not support these extensions.
-In particular, OpenCL 1.2 beignet 1.1.1 (git-6de6918)
-has been reported not to support __typeof__.
-The use of these extensions can be turned off with the
--no-allow-gnu-extensions option.
-
-
-Processing PolyBench
-
-When processing a PolyBench/C 3.2 benchmark, you should always specify
-DPOLYBENCH_USE_C99_PROTO on the ppcg command line.  Otherwise, the source
-files are inconsistent, having fixed size arrays but parametrically
-bounded loops iterating over them.
-However, you should not specify this define when compiling
-the PPCG generated code using nvcc since CUDA does not support VLAs.
-
-
-CUDA and function overloading
-
-While CUDA supports function overloading based on the argument types,
-no such function overloading exists in the input language C.  Since PPCG
-simply prints out the same function name as in the original code, this
-may result in a different function being called based on the types
-of the arguments.  For example, if the original code contains a call
-to the function sqrt() with a float argument, then the argument will
-be promoted to a double and the sqrt() function will be called.
-In the transformed (CUDA) code, however, overloading will cause the
-function sqrtf() to be called.  Until this issue has been resolved in PPCG,
-we recommend that users either explicitly call the function sqrtf() or
-explicitly cast the argument to double in the input code.
-
-
-Contact
-
-For bug reports, feature requests and questions,
-contact http://groups.google.com/group/isl-development
-
-Whenever you report a bug, please mention the exact version of PPCG
-that you are using (output of "./ppcg --version").  If you are unable
-to compile PPCG, then report the git version (output of "git describe")
-or the version number included in the name of the tarball.
-
-
-Citing PPCG
-
-If you use PPCG for your research, you are invited to cite
-the following paper.
-
-@article{Verdoolaege2013PPCG,
-    author = {Verdoolaege, Sven and Juega, Juan Carlos and Cohen, Albert and
-		G\'{o}mez, Jos{\'e} Ignacio and Tenllado, Christian and
-		Catthoor, Francky},
-    title = {Polyhedral parallel code generation for CUDA},
-    journal = {ACM Trans. Archit. Code Optim.},
-    issue_date = {January 2013},
-    volume = {9},
-    number = {4},
-    month = jan,
-    year = {2013},
-    issn = {1544-3566},
-    pages = {54:1--54:23},
-    doi = {10.1145/2400682.2400713},
-    acmid = {2400713},
-    publisher = {ACM},
-    address = {New York, NY, USA},
-}
--- a/polly/lib/External/ppcg/cpu.c
+++ b/polly/lib/External/ppcg/cpu.c
@ -1,802 +0,0 @@
-/*
- * Copyright 2012 INRIA Paris-Rocquencourt
- * Copyright 2012 Ecole Normale Superieure
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Tobias Grosser, INRIA Paris-Rocquencourt,
- * Domaine de Voluceau, Rocquencourt, B.P. 105,
- * 78153 Le Chesnay Cedex France
- * and Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
- */
-
-#include <limits.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <isl/aff.h>
-#include <isl/ctx.h>
-#include <isl/flow.h>
-#include <isl/map.h>
-#include <isl/ast_build.h>
-#include <isl/schedule.h>
-#include <isl/schedule_node.h>
-#include <pet.h>
-
-#include "ppcg.h"
-#include "ppcg_options.h"
-#include "cpu.h"
-#include "print.h"
-#include "schedule.h"
-#include "util.h"
-
-/* Representation of a statement inside a generated AST.
- *
- * "stmt" refers to the original statement.
- * "ref2expr" maps the reference identifier of each access in
- * the statement to an AST expression that should be printed
- * at the place of the access.
- *
- * ppcg_stmt_free releases "ref2expr" and the structure itself,
- * but not "stmt", which is only borrowed from the scop.
- */
-struct ppcg_stmt {
-	struct pet_stmt *stmt;
-
-	isl_id_to_ast_expr *ref2expr;
-};
-
-/* Release the ppcg_stmt pointed to by "user", together with
- * the reference-to-expression mapping it holds.
- * A NULL pointer is silently ignored.
- * The void pointer signature allows this function to be installed
- * through isl_id_set_free_user.
- */
-static void ppcg_stmt_free(void *user)
-{
-	struct ppcg_stmt *ppcg_stmt = user;
-
-	if (ppcg_stmt) {
-		isl_id_to_ast_expr_free(ppcg_stmt->ref2expr);
-		free(ppcg_stmt);
-	}
-}
-
-/* Open the output file for writing and return it.
- *
- * If "output" is not NULL, then it is the name of the file to open.
- * Otherwise, the name is derived from "input", the entire path
- * of the input file: the base name computed by ppcg_extract_base_name
- * is followed by ".ppcg" and by the extension of the input file
- * (".c" if the input file has no extension).
- * This means file.c becomes file.ppcg.c
- *
- * The extension is looked up in the last path component only,
- * such that a '.' in a directory name (e.g., "../file")
- * is not mistaken for the start of the extension.
- *
- * Return NULL, after printing an error message, if the derived name
- * does not fit in PATH_MAX characters or if the file cannot be opened.
- */
-static FILE *get_output_file(const char *input, const char *output)
-{
-	char name[PATH_MAX];
-	const char *base;
-	const char *ext;
-	const char ppcg_marker[] = ".ppcg";
-	int len;
-	FILE *file;
-
-	if (!output) {
-		/* ppcg_extract_base_name copies (part of) "input" into "name";
-		 * presumably without checking the size of "name", so reject
-		 * inputs that cannot possibly fit before calling it.
-		 */
-		if (strlen(input) >= sizeof(name)) {
-			fprintf(stderr, "Input file name '%s' is too long\n",
-				input);
-			return NULL;
-		}
-
-		base = strrchr(input, '/');
-		base = base ? base + 1 : input;
-		ext = strrchr(base, '.');
-		if (!ext)
-			ext = ".c";
-
-		len = ppcg_extract_base_name(name, input);
-		if (len < 0 || (size_t) len + sizeof(ppcg_marker) - 1 +
-				strlen(ext) >= sizeof(name)) {
-			fprintf(stderr, "Output file name derived from '%s' "
-				"is too long\n", input);
-			return NULL;
-		}
-
-		strcpy(name + len, ppcg_marker);
-		strcpy(name + len + sizeof(ppcg_marker) - 1, ext);
-		output = name;
-	}
-
-	file = fopen(output, "w");
-	if (!file) {
-		fprintf(stderr, "Unable to open '%s' for writing\n", output);
-		return NULL;
-	}
-
-	return file;
-}
-
-/* Data used to annotate "for" nodes in the AST.
- *
- * An object of this type is allocated by allocate_ast_node_userinfo and
- * attached as the user pointer of the isl_id created
- * in ast_build_before_for.
- */
-struct ast_node_userinfo {
-	/* The for node is an openmp parallel for node. */
-	int is_openmp;
-};
-
-/* Information used while building the ast.
- *
- * A pointer to this structure is passed as the "user" argument
- * of the ast_build_before_for and ast_build_after_for callbacks.
- */
-struct ast_build_userinfo {
-	/* The current ppcg scop. */
-	struct ppcg_scop *scop;
-
-	/* Are we currently in a parallel for loop?
-	 * Set by mark_openmp_parallel and cleared by ast_build_after_for.
-	 */
-	int in_parallel_for;
-};
-
-/* Check if the current scheduling dimension is parallel.
- *
- * We check for parallelism by verifying that the loop does not carry any
- * dependences.
- * If the live_range_reordering option is set, then this currently
- * includes the order dependences.  In principle, non-zero order dependences
- * could be allowed, but this would require privatization and/or expansion.
- *
- * Parallelism test: if the distance is zero in all outer dimensions, then it
- * has to be zero in the current dimension as well.
- * Implementation: first, translate dependences into time space, then force
- * outer dimensions to be equal.  If the distance is zero in the current
- * dimension, then the loop is parallel.
- * The distance is zero in the current dimension if it is a subset of a map
- * with equal values for the current dimension.
- *
- * Return 1 if the dimension is known to be parallel and 0 otherwise.
- * In particular, if any of the isl tests fails (isl_bool_error),
- * then the dimension is conservatively treated as not parallel
- * instead of letting the error value be interpreted as "true".
- */
-static int ast_schedule_dim_is_parallel(__isl_keep isl_ast_build *build,
-	struct ppcg_scop *scop)
-{
-	isl_union_map *schedule, *deps;
-	isl_map *schedule_deps, *test;
-	isl_space *schedule_space;
-	isl_bool empty, is_parallel;
-	unsigned i, dimension;
-
-	schedule = isl_ast_build_get_schedule(build);
-	schedule_space = isl_ast_build_get_schedule_space(build);
-
-	/* The space is only needed to find the current dimension. */
-	dimension = isl_space_dim(schedule_space, isl_dim_out) - 1;
-	isl_space_free(schedule_space);
-
-	deps = isl_union_map_copy(scop->dep_flow);
-	deps = isl_union_map_union(deps, isl_union_map_copy(scop->dep_false));
-	if (scop->options->live_range_reordering) {
-		isl_union_map *order = isl_union_map_copy(scop->dep_order);
-		deps = isl_union_map_union(deps, order);
-	}
-	deps = isl_union_map_apply_range(deps, isl_union_map_copy(schedule));
-	deps = isl_union_map_apply_domain(deps, schedule);
-
-	/* No dependences at all: parallel.  Error: not parallel. */
-	empty = isl_union_map_is_empty(deps);
-	if (empty != isl_bool_false) {
-		isl_union_map_free(deps);
-		return empty == isl_bool_true;
-	}
-
-	schedule_deps = isl_map_from_union_map(deps);
-
-	for (i = 0; i < dimension; i++)
-		schedule_deps = isl_map_equate(schedule_deps, isl_dim_out, i,
-					       isl_dim_in, i);
-
-	test = isl_map_universe(isl_map_get_space(schedule_deps));
-	test = isl_map_equate(test, isl_dim_out, dimension, isl_dim_in,
-			      dimension);
-	is_parallel = isl_map_is_subset(schedule_deps, test);
-
-	isl_map_free(test);
-	isl_map_free(schedule_deps);
-
-	return is_parallel == isl_bool_true;
-}
-
-/* Annotate the for node described by "node_info" as an openmp parallel
- * loop if the current schedule dimension is parallel and
- * no enclosing for node has been marked already.
- * In that case, also record in "build_info" that we are now inside
- * a parallel for loop.
- */
-static void mark_openmp_parallel(__isl_keep isl_ast_build *build,
-	struct ast_build_userinfo *build_info,
-	struct ast_node_userinfo *node_info)
-{
-	if (!build_info->in_parallel_for &&
-	    ast_schedule_dim_is_parallel(build, build_info->scop)) {
-		build_info->in_parallel_for = 1;
-		node_info->is_openmp = 1;
-	}
-}
-
-/* Allocate an ast_node_userinfo structure and initialize it
- * with default values (not an openmp parallel for node).
- *
- * Return NULL if the allocation fails; the caller is expected
- * to check the result before using it.
- * The result should be released using free_ast_node_userinfo.
- */
-static struct ast_node_userinfo *allocate_ast_node_userinfo(void)
-{
-	struct ast_node_userinfo *node_info;
-
-	node_info = malloc(sizeof(*node_info));
-	if (!node_info)
-		return NULL;
-	node_info->is_openmp = 0;
-	return node_info;
-}
-
-/* Release the ast_node_userinfo structure pointed to by "ptr".
- * The void pointer signature allows this function to be installed
- * through isl_id_set_free_user.
- */
-static void free_ast_node_userinfo(void *ptr)
-{
-	free(ptr);
-}
-
-/* This method is executed before the construction of a for node. It creates
- * an isl_id that is used to annotate the subsequently generated ast for nodes.
- * The isl_id carries an ast_node_userinfo structure that is freed
- * together with the isl_id.
- *
- * In this function we also run the following analyses:
- *
- * 	- Detection of openmp parallel loops
- *
- * "user" points to the ast_build_userinfo of the current AST build.
- *
- * Return NULL if the annotation cannot be allocated.
- * In that case, the ast_node_userinfo structure (if any) is released
- * here since no isl_id has taken ownership of it.
- */
-static __isl_give isl_id *ast_build_before_for(
-	__isl_keep isl_ast_build *build, void *user)
-{
-	isl_id *id;
-	struct ast_build_userinfo *build_info;
-	struct ast_node_userinfo *node_info;
-
-	build_info = (struct ast_build_userinfo *) user;
-	node_info = allocate_ast_node_userinfo();
-	if (!node_info)
-		return NULL;
-
-	id = isl_id_alloc(isl_ast_build_get_ctx(build), "", node_info);
-	if (!id) {
-		free_ast_node_userinfo(node_info);
-		return NULL;
-	}
-	id = isl_id_set_free_user(id, free_ast_node_userinfo);
-
-	mark_openmp_parallel(build, build_info, node_info);
-
-	return id;
-}
-
-/* Callback invoked once a for node has been constructed.
- *
- * If the annotation of "node" marks it as an openmp parallel for node,
- * then we are leaving the parallel loop and the 'in_parallel_for' flag
- * of the ast_build_userinfo pointed to by "user" is cleared.
- * The node itself is returned unchanged.
- */
-static __isl_give isl_ast_node *ast_build_after_for(
-	__isl_take isl_ast_node *node, __isl_keep isl_ast_build *build,
-	void *user)
-{
-	struct ast_build_userinfo *build_info = user;
-	isl_id *annotation = isl_ast_node_get_annotation(node);
-	struct ast_node_userinfo *node_info = isl_id_get_user(annotation);
-
-	if (node_info && node_info->is_openmp)
-		build_info->in_parallel_for = 0;
-
-	isl_id_free(annotation);
-
-	return node;
-}
-
-/* Look up the statement in scop->pet->stmts whose domain
- * has "id" as tuple identifier.
- * If there is no such statement, then an internal error is reported
- * and NULL is returned.
- */
-static struct pet_stmt *find_stmt(struct ppcg_scop *scop, __isl_keep isl_id *id)
-{
-	int pos;
-
-	for (pos = 0; pos < scop->pet->n_stmt; ++pos) {
-		struct pet_stmt *candidate = scop->pet->stmts[pos];
-		isl_id *domain_id = isl_set_get_tuple_id(candidate->domain);
-		int found = domain_id == id;
-
-		isl_id_free(domain_id);
-		if (found)
-			return candidate;
-	}
-
-	isl_die(isl_id_get_ctx(id), isl_error_internal,
-		"statement not found", return NULL);
-}
-
-/* Print the user statement associated to "node" to "p".
- * The ppcg_stmt to print is taken from the annotation of the node,
- * where it has been attached by at_each_domain.
- * The accesses are printed using the expressions in its "ref2expr".
- */
-static __isl_give isl_printer *print_user(__isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *print_options,
-	__isl_keep isl_ast_node *node, void *user)
-{
-	isl_id *annotation = isl_ast_node_get_annotation(node);
-	struct ppcg_stmt *ppcg_stmt = isl_id_get_user(annotation);
-
-	isl_id_free(annotation);
-	isl_ast_print_options_free(print_options);
-
-	return pet_stmt_print_body(ppcg_stmt->stmt, p, ppcg_stmt->ref2expr);
-}
-
-
-/* Print the for node "node" as an openmp parallel loop.
- *
- * That is, print a line containing "#pragma omp parallel for"
- * followed by the regular printout of the for loop.
- *
- * No explicit data-sharing clauses are needed.  Variables declared
- * inside the loop body are automatically 'private' and iterators
- * declared outside of the loop are automatically 'shared'.
- * Since ppcg declares every iterator at the position where it is
- * assigned, these defaults are already correct.
- *
- * The result is only valid OpenMP code if the ast was generated
- * with the 'atomic-bounds' option enabled.
- */
-static __isl_give isl_printer *print_for_with_openmp(
-	__isl_keep isl_ast_node *node, __isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *print_options)
-{
-	p = isl_printer_print_str(isl_printer_start_line(p),
-				  "#pragma omp parallel for");
-	p = isl_printer_end_line(p);
-
-	return isl_ast_node_for_print(node, p, print_options);
-}
-
-/* Print the for node "node" to "p".
- *
- * If the annotation of the node marks it as an openmp parallel for node,
- * then it is printed through print_for_with_openmp.
- * Otherwise, it is printed as a regular for node.
- */
-static __isl_give isl_printer *print_for(__isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *print_options,
-	__isl_keep isl_ast_node *node, void *user)
-{
-	isl_id *annotation = isl_ast_node_get_annotation(node);
-	struct ast_node_userinfo *node_info = NULL;
-
-	if (annotation)
-		node_info = isl_id_get_user(annotation);
-
-	if (node_info && node_info->is_openmp)
-		p = print_for_with_openmp(node, p, print_options);
-	else
-		p = isl_ast_node_for_print(node, p, print_options);
-
-	isl_id_free(annotation);
-
-	return p;
-}
-
-/* Callback for pet_stmt_build_ast_exprs that rewrites an index expression.
- *
- * "index" expresses the array indices in terms of statement iterators.
- * "user" points to an isl_pw_multi_aff expressing the statement iterators
- * in terms of AST loop iterators.  It is only borrowed; a copy is
- * consumed by the pullback.
- *
- * Return the array indices expressed in terms of AST loop iterators.
- */
-static __isl_give isl_multi_pw_aff *pullback_index(
-	__isl_take isl_multi_pw_aff *index, __isl_keep isl_id *id, void *user)
-{
-	isl_pw_multi_aff *iterator_map = isl_pw_multi_aff_copy(user);
-
-	return isl_multi_pw_aff_pullback_pw_multi_aff(index, iterator_map);
-}
-
-/* Annotate the user node "node" with a ppcg_stmt.
- *
- * The statement called by "node" is looked up in the ppcg_scop "user"
- * based on the identifier of the first argument of the node expression.
- * The schedule of "build" is reversed to obtain a mapping from
- * AST loop iterators to statement iterators, which is used to let
- * pet_stmt_build_ast_exprs construct AST expressions for the accesses
- * of the statement in terms of the AST loop iterators.
- * The result is stored in a ppcg_stmt that is attached to the node
- * through an isl_id that also takes care of freeing it.
- *
- * On failure, the node is freed and NULL is returned.
- */
-static __isl_give isl_ast_node *at_each_domain(__isl_take isl_ast_node *node,
-	__isl_keep isl_ast_build *build, void *user)
-{
-	struct ppcg_scop *scop = user;
-	isl_ctx *ctx = isl_ast_node_get_ctx(node);
-	struct ppcg_stmt *ppcg_stmt;
-	isl_ast_expr *call, *name;
-	isl_id *id;
-	isl_map *sched;
-	isl_pw_multi_aff *iterator_map;
-
-	ppcg_stmt = isl_calloc_type(ctx, struct ppcg_stmt);
-	if (!ppcg_stmt)
-		goto error;
-
-	/* The first argument of the call identifies the statement. */
-	call = isl_ast_node_user_get_expr(node);
-	name = isl_ast_expr_get_op_arg(call, 0);
-	isl_ast_expr_free(call);
-	id = isl_ast_expr_get_id(name);
-	isl_ast_expr_free(name);
-	ppcg_stmt->stmt = find_stmt(scop, id);
-	isl_id_free(id);
-	if (!ppcg_stmt->stmt)
-		goto error;
-
-	sched = isl_map_from_union_map(isl_ast_build_get_schedule(build));
-	iterator_map = isl_pw_multi_aff_from_map(isl_map_reverse(sched));
-	ppcg_stmt->ref2expr = pet_stmt_build_ast_exprs(ppcg_stmt->stmt, build,
-				    &pullback_index, iterator_map, NULL, NULL);
-	isl_pw_multi_aff_free(iterator_map);
-
-	id = isl_id_alloc(ctx, NULL, ppcg_stmt);
-	id = isl_id_set_free_user(id, &ppcg_stmt_free);
-	return isl_ast_node_set_annotation(node, id);
-error:
-	ppcg_stmt_free(ppcg_stmt);
-	return isl_ast_node_free(node);
-}
-
-/* Schedule tree traversal callback that keeps track of the maximal
- * schedule depth of the leaves in the integer pointed to by "user"
- * (initialized to 0 by the caller).
- *
- * For a leaf node, the maximum is updated and isl_bool_false is returned.
- * For any other node, isl_bool_true is returned.
- */
-static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user)
-{
-	int *max_depth = user;
-	int leaf_depth;
-
-	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf) {
-		leaf_depth = isl_schedule_node_get_schedule_depth(node);
-		if (*max_depth < leaf_depth)
-			*max_depth = leaf_depth;
-		return isl_bool_false;
-	}
-
-	return isl_bool_true;
-}
-
-/* Callback for the traversal of a CPU AST in cpu_print_macros.
- * "user" points to the isl_printer pointer to print to.
- *
- * For a user node, print the macro definitions that are needed
- * for printing the AST expressions of the ppcg_stmt in its annotation,
- * i.e., those needed for the substitutions of the original
- * user statement, and return isl_bool_false.
- * A user node without a ppcg_stmt or a failure to print
- * results in isl_bool_error.
- * For any other node, return isl_bool_true such that the descendants
- * are visited as well.
- */
-static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
-{
-	isl_printer **printer = user;
-	isl_id *annotation;
-	struct ppcg_stmt *ppcg_stmt;
-
-	if (isl_ast_node_get_type(node) != isl_ast_node_user)
-		return isl_bool_true;
-
-	annotation = isl_ast_node_get_annotation(node);
-	ppcg_stmt = isl_id_get_user(annotation);
-	isl_id_free(annotation);
-
-	if (!ppcg_stmt)
-		return isl_bool_error;
-
-	*printer = ppcg_print_body_macros(*printer, ppcg_stmt->ref2expr);
-
-	return *printer ? isl_bool_false : isl_bool_error;
-}
-
-/* Print to "p" all the macro definitions needed by the CPU AST "node".
- * The macros needed by the user statements inside the AST are printed
- * first (through at_node), followed by those of the AST itself.
- * If the traversal fails, then "p" is freed.
- */
-static __isl_give isl_printer *cpu_print_macros(__isl_take isl_printer *p,
-	__isl_keep isl_ast_node *node)
-{
-	int visited;
-
-	visited = isl_ast_node_foreach_descendant_top_down(node, &at_node, &p);
-	if (visited < 0)
-		return isl_printer_free(p);
-
-	return ppcg_print_macros(p, node);
-}
-
-/* Generate an AST for 'scop' from "schedule" and
- * print the corresponding C code to 'p'.
- *
- * The AST iterators are named "c<i>", with as many names as
- * the maximal schedule depth of the leaves of "schedule".
- * Each user node is annotated by at_each_domain.
- * If the openmp option is set, then the for nodes are additionally
- * annotated such that the outermost parallel loops
- * are printed as openmp parallel loops.
- * The required macro definitions are printed before the AST itself.
- *
- * If the schedule traversal fails, then both "schedule" and "p"
- * are freed and NULL is returned.
- */
-static __isl_give isl_printer *print_scop(struct ppcg_scop *scop,
-	__isl_take isl_schedule *schedule, __isl_take isl_printer *p,
-	struct ppcg_options *options)
-{
-	isl_ctx *ctx = isl_printer_get_ctx(p);
-	struct ast_build_userinfo build_info;
-	isl_ast_build *build;
-	isl_ast_print_options *print_options;
-	isl_ast_node *ast;
-	isl_id_list *names;
-	int max_depth = 0;
-
-	if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
-						&max_depth) < 0) {
-		isl_schedule_free(schedule);
-		isl_printer_free(p);
-		return NULL;
-	}
-
-	build = isl_ast_build_alloc(ctx);
-	names = ppcg_scop_generate_names(scop, max_depth, "c");
-	build = isl_ast_build_set_iterators(build, names);
-	build = isl_ast_build_set_at_each_domain(build, &at_each_domain, scop);
-
-	if (options->openmp) {
-		/* "build_info" is shared by both for node callbacks. */
-		build_info.in_parallel_for = 0;
-		build_info.scop = scop;
-
-		build = isl_ast_build_set_before_each_for(build,
-							&ast_build_before_for,
-							&build_info);
-		build = isl_ast_build_set_after_each_for(build,
-							&ast_build_after_for,
-							&build_info);
-	}
-
-	ast = isl_ast_build_node_from_schedule(build, schedule);
-	isl_ast_build_free(build);
-
-	print_options = isl_ast_print_options_alloc(ctx);
-	print_options = isl_ast_print_options_set_print_user(print_options,
-							&print_user, NULL);
-	print_options = isl_ast_print_options_set_print_for(print_options,
-							&print_for, NULL);
-
-	p = cpu_print_macros(p, ast);
-	p = isl_ast_node_print(ast, p, print_options);
-
-	isl_ast_node_free(ast);
-
-	return p;
-}
-
-/* Apply tiling with tile sizes "sizes" to the band node "node" and
- * mark every member of the resulting tile node as "atomic".
- */
-static __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node,
-	__isl_take isl_multi_val *sizes)
-{
-	isl_schedule_node *tiled;
-
-	tiled = isl_schedule_node_band_tile(node, sizes);
-
-	return ppcg_set_schedule_node_type(tiled, isl_ast_loop_atomic);
-}
-
-/* Schedule tree callback that tiles "node" when it is a band node
- * with at least 2 members; any other node is returned unchanged.
- * "user" is the ppcg_scop whose "tile_size" option provides the
- * (uniform) tile size.
- */
-static __isl_give isl_schedule_node *tile_band(
-	__isl_take isl_schedule_node *node, void *user)
-{
-	struct ppcg_scop *scop = user;
-	isl_multi_val *sizes;
-	isl_space *band_space;
-	int n_member;
-
-	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
-		return node;
-
-	n_member = isl_schedule_node_band_n_member(node);
-	if (n_member <= 1)
-		return node;
-
-	band_space = isl_schedule_node_band_get_space(node);
-	sizes = ppcg_multi_val_from_int(band_space, scop->options->tile_size);
-
-	return tile(node, sizes);
-}
-
-/* Build the schedule constraints used for computing a CPU schedule
- * from the dependences in "ps".
- *
- * Proximity: always the flow dependences.
- *
- * With live-range reordering:
- * - conditional validity: the order dependences, with the flow dependences
- *   as condition, i.e., a live-range (flow dependence) is either local
- *   to an iteration of a band or all adjacent order dependences are
- *   respected by the band;
- * - validity: flow dependences + forced dependences;
- * - coincidence: flow + forced + order dependences.
- *
- * Without live-range reordering:
- * - validity and coincidence: flow dependences + false dependences.
- *
- * The coincidence constraints are only set when the "openmp" option
- * is set.  The way openmp pragmas are introduced does not rely on
- * the coincident property of the band members, but the coincidence
- * constraints do influence the schedule that is constructed, such that
- * more schedule dimensions should be detected as parallel by
- * ast_schedule_dim_is_parallel.  Because that function also takes the order
- * dependences into account, they are part of the coincidence constraints.
- * If the openmp handling learns how to privatize some memory, then
- * the corresponding order dependences can be removed from them.
- */
-static __isl_give isl_schedule_constraints *construct_cpu_schedule_constraints(
-	struct ppcg_scop *ps)
-{
-	isl_schedule_constraints *sc;
-	isl_union_map *validity, *coincidence;
-	int reorder = ps->options->live_range_reordering;
-
-	sc = isl_schedule_constraints_on_domain(isl_union_set_copy(ps->domain));
-	if (reorder)
-		sc = isl_schedule_constraints_set_conditional_validity(sc,
-				isl_union_map_copy(ps->tagged_dep_flow),
-				isl_union_map_copy(ps->tagged_dep_order));
-
-	validity = isl_union_map_copy(ps->dep_flow);
-	if (reorder)
-		validity = isl_union_map_union(validity,
-				isl_union_map_copy(ps->dep_forced));
-	else
-		validity = isl_union_map_union(validity,
-				isl_union_map_copy(ps->dep_false));
-
-	if (ps->options->openmp) {
-		coincidence = isl_union_map_copy(validity);
-		if (reorder)
-			coincidence = isl_union_map_union(coincidence,
-					isl_union_map_copy(ps->dep_order));
-		sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
-	}
-
-	sc = isl_schedule_constraints_set_validity(sc, validity);
-	sc = isl_schedule_constraints_set_proximity(sc,
-					isl_union_map_copy(ps->dep_flow));
-
-	return sc;
-}
-
-/* Compute a schedule for the scop "ps"; return NULL if "ps" is NULL.
- *
- * The schedule constraints are derived from the dependences in "ps"
- * (and dumped if the dump_schedule_constraints debug option is set),
- * after which ppcg_compute_schedule computes a schedule from them,
- * possibly grouping statement instances based on the input schedule.
- */
-static __isl_give isl_schedule *compute_cpu_schedule(struct ppcg_scop *ps)
-{
-	isl_schedule_constraints *constraints;
-
-	if (!ps)
-		return NULL;
-
-	constraints = construct_cpu_schedule_constraints(ps);
-	if (ps->options->debug->dump_schedule_constraints)
-		isl_schedule_constraints_dump(constraints);
-
-	return ppcg_compute_schedule(constraints, ps->schedule, ps->options);
-}
-
-/* Return the schedule to use for the scop passed in "user":
- * a newly computed one if the "reschedule" option is set,
- * a copy of the original schedule otherwise.
- * Return NULL if no scop is given.
- */
-static __isl_give isl_schedule *optionally_compute_schedule(void *user)
-{
-	struct ppcg_scop *ps = user;
-
-	if (!ps)
-		return NULL;
-	if (ps->options->reschedule)
-		return compute_cpu_schedule(ps);
-
-	return isl_schedule_copy(ps->schedule);
-}
-
-/* Obtain a schedule for "ps" through ppcg_get_schedule (which is handed
- * optionally_compute_schedule as callback) and, if the "tile" option
- * of "ps" is set, tile its band nodes bottom-up.
- * Return NULL if "ps" is NULL.
- */
-static __isl_give isl_schedule *get_schedule(struct ppcg_scop *ps,
-	struct ppcg_options *options)
-{
-	isl_schedule *schedule;
-
-	if (!ps)
-		return NULL;
-
-	schedule = ppcg_get_schedule(isl_union_set_get_ctx(ps->domain),
-				    options, &optionally_compute_schedule, ps);
-	if (!ps->options->tile)
-		return schedule;
-
-	return isl_schedule_map_schedule_node_bottom_up(schedule,
-							&tile_band, ps);
-}
-
-/* Print C code for the scop "ps", generated according to "schedule",
- * to "p", preceded by a banner comment and the variable declarations.
- *
- * Hidden declarations, if any, are printed inside a block of their own
- * that also encloses the generated code.
- * The context of "ps" is inserted into the schedule before code generation
- * and the final schedule is dumped if the dump_final_schedule debug option
- * is set.
- */
-static __isl_give isl_printer *print_cpu_with_schedule(
-	__isl_take isl_printer *p, struct ppcg_scop *ps,
-	__isl_take isl_schedule *schedule, struct ppcg_options *options)
-{
-	isl_set *context;
-	int has_hidden;
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "/* ppcg generated CPU code */");
-	p = isl_printer_end_line(p);
-
-	/* Empty separator line. */
-	p = isl_printer_start_line(p);
-	p = isl_printer_end_line(p);
-
-	p = ppcg_set_macro_names(p);
-	p = ppcg_print_exposed_declarations(p, ps);
-	has_hidden = ppcg_scop_any_hidden_declarations(ps);
-	if (has_hidden) {
-		p = ppcg_start_block(p);
-		p = ppcg_print_hidden_declarations(p, ps);
-	}
-
-	context = isl_set_from_params(isl_set_copy(ps->context));
-	schedule = isl_schedule_insert_context(schedule, context);
-	if (options->debug->dump_final_schedule)
-		isl_schedule_dump(schedule);
-
-	p = print_scop(ps, schedule, p, options);
-
-	return has_hidden ? ppcg_end_block(p) : p;
-}
-
-/* Print C code for the scop "ps" to "p", including variable declarations,
- * using a copy of the schedule already stored in "ps".
- */
-__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
-	struct ppcg_scop *ps, struct ppcg_options *options)
-{
-	return print_cpu_with_schedule(p, ps,
-				isl_schedule_copy(ps->schedule), options);
-}
-
-/* Print CPU code for "scop" to "p".
- *
- * A schedule for "scop" is obtained through get_schedule and
- * the code is then printed according to that schedule.
- */
-static __isl_give isl_printer *generate(__isl_take isl_printer *p,
-	struct ppcg_scop *scop, struct ppcg_options *options)
-{
-	isl_schedule *sched = get_schedule(scop, options);
-
-	return print_cpu_with_schedule(p, scop, sched, options);
-}
-
-/* ppcg_transform callback that forwards to generate,
- * with "user" carrying the ppcg_options.
- */
-static __isl_give isl_printer *print_cpu_wrap(__isl_take isl_printer *p,
-	struct ppcg_scop *scop, void *user)
-{
-	struct ppcg_options *opts = user;
-
-	p = generate(p, scop, opts);
-
-	return p;
-}
-
-/* Replace every scop in the file called "input" by corresponding CPU code
- * and write the result to the file obtained from get_output_file
- * for "input" and "output".
- * Return -1 if that file cannot be obtained and
- * the result of ppcg_transform otherwise.
- */
-int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
-	const char *input, const char *output)
-{
-	int status;
-	FILE *out = get_output_file(input, output);
-
-	if (!out)
-		return -1;
-
-	status = ppcg_transform(ctx, input, out, options,
-					&print_cpu_wrap, options);
-	fclose(out);
-
-	return status;
-}
--- a/polly/lib/External/ppcg/cpu.h
+++ b/polly/lib/External/ppcg/cpu.h
@ -1,15 +0,0 @@
-#ifndef _CPU_H
-#define _CPU_H
-
-#include <isl/ctx.h>
-
-#include "ppcg.h"
-
-/* Only used through pointers in this header. */
-struct ppcg_options;
-
-/* Generate CPU code for the scop "ps" and print the corresponding C code
- * to "p", including variable declarations.
- */
-__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
-	struct ppcg_scop *ps, struct ppcg_options *options);
-/* Transform the code in the file called "input" by replacing all scops
- * by corresponding CPU code and write the results to a file called "output".
- */
-int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
-	const char *input, const char *output);
-
-#endif
--- a/polly/lib/External/ppcg/cuda.c
+++ b/polly/lib/External/ppcg/cuda.c
@ -1,730 +0,0 @@
-/*
- * Copyright 2012      Ecole Normale Superieure
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
- */
-
-#include <isl/aff.h>
-#include <isl/ast.h>
-
-#include "cuda_common.h"
-#include "cuda.h"
-#include "gpu.h"
-#include "gpu_print.h"
-#include "print.h"
-#include "util.h"
-
-/* Print to "p" the definitions of the cudaCheckReturn and cudaCheckKernel
- * helper macros that the generated host code uses for checking
- * the result of CUDA calls and kernel launches.
- */
-static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
-{
-	static const char check_macros[] =
-		"#define cudaCheckReturn(ret) \\\n"
-		"  do { \\\n"
-		"    cudaError_t cudaCheckReturn_e = (ret); \\\n"
-		"    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
-		"      fprintf(stderr, \"CUDA error: %s\\n\", "
-		"cudaGetErrorString(cudaCheckReturn_e)); \\\n"
-		"      fflush(stderr); \\\n"
-		"    } \\\n"
-		"    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
-		"  } while(0)\n"
-		"#define cudaCheckKernel() \\\n"
-		"  do { \\\n"
-		"    cudaCheckReturn(cudaGetLastError()); \\\n"
-		"  } while(0)\n\n";
-
-	return isl_printer_print_str(p, check_macros);
-}
-
-/* Print to "p" the declaration of the device copy "dev_<name>" of "array".
- *
- * An array that is not linearized and has more than one index is declared
- * as a pointer to an array, with the bounds of all but the first dimension
- * spelled out; every other array is declared as a plain pointer.
- */
-static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
-	struct gpu_array_info *array)
-{
-	int dim;
-	int multi_dim = !array->linearize && array->n_index > 1;
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, array->type);
-	p = isl_printer_print_str(p, " ");
-	if (multi_dim)
-		p = isl_printer_print_str(p, "(");
-	p = isl_printer_print_str(p, "*dev_");
-	p = isl_printer_print_str(p, array->name);
-	if (multi_dim) {
-		p = isl_printer_print_str(p, ")");
-		for (dim = 1; dim < array->n_index; dim++) {
-			isl_ast_expr *size;
-
-			/* Argument 0 is skipped; dimension "dim" is at 1 + dim. */
-			size = isl_ast_expr_get_op_arg(array->bound_expr,
-							1 + dim);
-			p = isl_printer_print_str(p, "[");
-			p = isl_printer_print_ast_expr(p, size);
-			p = isl_printer_print_str(p, "]");
-			isl_ast_expr_free(size);
-		}
-	}
-	p = isl_printer_print_str(p, ";");
-
-	return isl_printer_end_line(p);
-}
-
-/* Print to "p" a declaration for each array of "prog" that requires
- * allocation on the device, followed by an empty line.
- */
-static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
-	struct gpu_prog *prog)
-{
-	int i;
-
-	for (i = 0; i < prog->n_array; ++i) {
-		struct gpu_array_info *array = &prog->array[i];
-
-		if (gpu_array_requires_device_allocation(array))
-			p = declare_device_array(p, array);
-	}
-
-	p = isl_printer_start_line(p);
-
-	return isl_printer_end_line(p);
-}
-
-/* Print to "p" a checked cudaMalloc call for each array of "prog"
- * that requires allocation on the device, preceded by the macros needed
- * for printing its bounds, and finish with an empty line.
- */
-static __isl_give isl_printer *allocate_device_arrays(
-	__isl_take isl_printer *p, struct gpu_prog *prog)
-{
-	int i;
-
-	for (i = 0; i < prog->n_array; ++i) {
-		struct gpu_array_info *array = &prog->array[i];
-
-		if (!gpu_array_requires_device_allocation(array))
-			continue;
-
-		p = ppcg_ast_expr_print_macros(array->bound_expr, p);
-		p = isl_printer_start_line(p);
-		p = isl_printer_print_str(p,
-			"cudaCheckReturn(cudaMalloc((void **) &dev_");
-		p = isl_printer_print_str(p, array->name);
-		p = isl_printer_print_str(p, ", ");
-		p = gpu_array_info_print_size(p, array);
-		p = isl_printer_print_str(p, "));");
-		p = isl_printer_end_line(p);
-	}
-
-	p = isl_printer_start_line(p);
-
-	return isl_printer_end_line(p);
-}
-
-/* Print to "p" a checked cudaFree call for each array of "prog"
- * that requires allocation on the device.
- */
-static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
-	struct gpu_prog *prog)
-{
-	int i;
-
-	for (i = 0; i < prog->n_array; ++i) {
-		struct gpu_array_info *array = &prog->array[i];
-
-		if (!gpu_array_requires_device_allocation(array))
-			continue;
-
-		p = isl_printer_start_line(p);
-		p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
-		p = isl_printer_print_str(p, array->name);
-		p = isl_printer_print_str(p, "));");
-		p = isl_printer_end_line(p);
-	}
-
-	return p;
-}
-
-/* Print to "p" a checked cudaMemcpy call that copies all of "array"
- * from the host to its device copy "dev_<name>".
- * A scalar is passed by address on the host side.
- * The size is printed by gpu_array_info_print_size, based on the bounds
- * on the extent of "array" precomputed in extract_array_info.
- */
-static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
-	struct gpu_array_info *array)
-{
-	const char *host_prefix = gpu_array_is_scalar(array) ? "&" : NULL;
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
-	p = isl_printer_print_str(p, array->name);
-	p = isl_printer_print_str(p, ", ");
-
-	if (host_prefix)
-		p = isl_printer_print_str(p, host_prefix);
-	p = isl_printer_print_str(p, array->name);
-	p = isl_printer_print_str(p, ", ");
-
-	p = gpu_array_info_print_size(p, array);
-	p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
-
-	return isl_printer_end_line(p);
-}
-
-/* Print to "p" a checked cudaMemcpy call that copies all of "array"
- * from its device copy "dev_<name>" back to the host.
- * A scalar is passed by address on the host side.
- * The size is printed by gpu_array_info_print_size, based on the bounds
- * on the extent of "array" precomputed in extract_array_info.
- */
-static __isl_give isl_printer *copy_array_from_device(
-	__isl_take isl_printer *p, struct gpu_array_info *array)
-{
-	int scalar;
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
-	scalar = gpu_array_is_scalar(array);
-	if (scalar)
-		p = isl_printer_print_str(p, "&");
-	p = isl_printer_print_str(p, array->name);
-	p = isl_printer_print_str(p, ", dev_");
-	p = isl_printer_print_str(p, array->name);
-	p = isl_printer_print_str(p, ", ");
-	p = gpu_array_info_print_size(p, array);
-	p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
-
-	return isl_printer_end_line(p);
-}
-
-/* Print the "len" integers in "list" to "p" in reverse order,
- * as a parenthesized, comma separated list.
- * Nothing is printed if "len" is zero.
- */
-static __isl_give isl_printer* print_reverse_list(__isl_take isl_printer *p, int len, int *list)
-{
-	int pos;
-
-	if (len == 0)
-		return p;
-
-	p = isl_printer_print_str(p, "(");
-	for (pos = len - 1; pos >= 0; --pos) {
-		p = isl_printer_print_int(p, list[pos]);
-		if (pos > 0)
-			p = isl_printer_print_str(p, ", ");
-	}
-	p = isl_printer_print_str(p, ")");
-
-	return p;
-}
-
-/* Print to "p" the effective grid size of "kernel" as a parenthesized list
- * of the sizes in each dimension, from innermost to outermost.
- * Nothing is printed for a zero-dimensional grid.
- */
-static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
-	struct ppcg_kernel *kernel)
-{
-	int pos;
-	int n_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
-
-	if (n_dim == 0)
-		return p;
-
-	p = isl_printer_print_str(p, "(");
-	pos = n_dim;
-	while (--pos >= 0) {
-		isl_ast_expr *size;
-
-		/* Argument 0 is skipped; dimension "pos" is at 1 + pos. */
-		size = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + pos);
-		p = isl_printer_print_ast_expr(p, size);
-		isl_ast_expr_free(size);
-
-		if (pos != 0)
-			p = isl_printer_print_str(p, ", ");
-	}
-
-	return isl_printer_print_str(p, ")");
-}
-
-/* Print to "p" the definition "dim3 k<id>_dimGrid(<sizes>);"
- * of the grid of "kernel".
- */
-static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
-	struct ppcg_kernel *kernel)
-{
-	p = isl_printer_start_line(p);
-
-	p = isl_printer_print_str(p, "dim3 k");
-	p = isl_printer_print_int(p, kernel->id);
-	p = isl_printer_print_str(p, "_dimGrid");
-	p = print_grid_size(p, kernel);
-	p = isl_printer_print_str(p, ";");
-
-	return isl_printer_end_line(p);
-}
-
-/* Print the arguments to a kernel declaration or call.  If "types" is set,
- * then print a declaration (including the types of the arguments).
- *
- * The arguments are printed in the following order
- * - the arrays accessed by the kernel
- * - the parameters
- * - the host loop iterators
- *
- * If any of the required information cannot be obtained,
- * "p" is freed and NULL is returned.
- */
-static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
-	struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
-{
-	int i, n;
-	int first = 1;
-	int nparam;
-	isl_space *space;
-	const char *type;
-
-	for (i = 0; i < prog->n_array; ++i) {
-		int required;
-
-		required = ppcg_kernel_requires_array_argument(kernel, i);
-		if (required < 0)
-			return isl_printer_free(p);
-		if (!required)
-			continue;
-
-		if (!first)
-			p = isl_printer_print_str(p, ", ");
-
-		if (types)
-			p = gpu_array_info_print_declaration_argument(p,
-				&prog->array[i], NULL);
-		else
-			p = gpu_array_info_print_call_argument(p,
-				&prog->array[i]);
-
-		first = 0;
-	}
-
-	space = isl_union_set_get_space(kernel->arrays);
-	/* Keep the count signed: a negative (error) result must not be
-	 * turned into a huge unsigned loop bound.
-	 */
-	nparam = isl_space_dim(space, isl_dim_param);
-	if (nparam < 0) {
-		isl_space_free(space);
-		return isl_printer_free(p);
-	}
-	for (i = 0; i < nparam; ++i) {
-		const char *name;
-
-		name = isl_space_get_dim_name(space, isl_dim_param, i);
-
-		if (!first)
-			p = isl_printer_print_str(p, ", ");
-		if (types)
-			p = isl_printer_print_str(p, "int ");
-		p = isl_printer_print_str(p, name);
-
-		first = 0;
-	}
-	isl_space_free(space);
-
-	n = isl_space_dim(kernel->space, isl_dim_set);
-	if (n < 0)
-		return isl_printer_free(p);
-	type = isl_options_get_ast_iterator_type(prog->ctx);
-	for (i = 0; i < n; ++i) {
-		const char *name;
-
-		if (!first)
-			p = isl_printer_print_str(p, ", ");
-		name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
-		if (types) {
-			p = isl_printer_print_str(p, type);
-			p = isl_printer_print_str(p, " ");
-		}
-		p = isl_printer_print_str(p, name);
-
-		first = 0;
-	}
-
-	return p;
-}
-
-/* Print to "p" the header "__global__ void kernel<id>(<arguments>)"
- * of "kernel", with the arguments printed as declarations.
- * The line is left open such that the caller can append to it.
- */
-static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
-	struct gpu_prog *prog, struct ppcg_kernel *kernel)
-{
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "__global__ void kernel");
-	p = isl_printer_print_int(p, kernel->id);
-	p = isl_printer_print_str(p, "(");
-	p = print_kernel_arguments(p, prog, kernel, 1);
-
-	return isl_printer_print_str(p, ")");
-}
-
-/* Print the header of "kernel" to both cuda->kernel_h (as a declaration,
- * terminated by a semicolon) and cuda->kernel_c (as the start
- * of the definition).
- */
-static void print_kernel_headers(struct gpu_prog *prog,
-	struct ppcg_kernel *kernel, struct cuda_info *cuda)
-{
-	isl_printer *decl, *def;
-
-	decl = isl_printer_to_file(prog->ctx, cuda->kernel_h);
-	decl = isl_printer_set_output_format(decl, ISL_FORMAT_C);
-	decl = print_kernel_header(decl, prog, kernel);
-	decl = isl_printer_print_str(decl, ";");
-	decl = isl_printer_end_line(decl);
-	isl_printer_free(decl);
-
-	def = isl_printer_to_file(prog->ctx, cuda->kernel_c);
-	def = isl_printer_set_output_format(def, ISL_FORMAT_C);
-	def = print_kernel_header(def, prog, kernel);
-	def = isl_printer_end_line(def);
-	isl_printer_free(def);
-}
-
-/* Write an empty string padded to a field width of "indent" to "dst". */
-static void print_indent(FILE *dst, int indent)
-{
-	const char *nothing = "";
-
-	fprintf(dst, "%*s", indent, nothing);
-}
-
-/* Print to "out" a declaration of type "type" for the iterators named
- * by "ids", each initialized to one of the cuda identifiers in "cuda_dims".
- * The identifiers are assigned in reverse order, such that the last
- * iterator gets the first entry of "cuda_dims" (the x identifier).
- * Nothing is printed if "ids" is empty.
- */
-static void print_iterators(FILE *out, const char *type,
-	__isl_keep isl_id_list *ids, const char *cuda_dims[])
-{
-	int pos, count;
-	const char *sep = "";
-
-	count = isl_id_list_n_id(ids);
-	if (count <= 0)
-		return;
-
-	print_indent(out, 4);
-	fprintf(out, "%s ", type);
-	for (pos = 0; pos < count; ++pos) {
-		isl_id *id = isl_id_list_get_id(ids, pos);
-
-		fprintf(out, "%s%s = %s", sep, isl_id_get_name(id),
-			cuda_dims[count - 1 - pos]);
-		isl_id_free(id);
-		sep = ", ";
-	}
-	fprintf(out, ";\n");
-}
-
-/* Print to "out" the declarations of the block and thread iterators
- * of "kernel", initialized from blockIdx and threadIdx respectively,
- * using the AST iterator type configured in the isl context.
- */
-static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
-{
-	const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
-	const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
-					"threadIdx.z" };
-	isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
-	const char *iterator_type = isl_options_get_ast_iterator_type(ctx);
-
-	print_iterators(out, iterator_type, kernel->block_ids, block_dims);
-	print_iterators(out, iterator_type, kernel->thread_ids, thread_dims);
-}
-
-/* Print to "p" the declaration of the kernel-local variable "var":
- * the element type of the underlying array, the name of the variable and
- * one bracketed size (taken from var->size) per index of that array.
- * A variable of type ppcg_access_shared is qualified with "__shared__".
- */
-static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
-	struct ppcg_kernel_var *var)
-{
-	int dim;
-
-	p = isl_printer_start_line(p);
-	if (var->type == ppcg_access_shared)
-		p = isl_printer_print_str(p, "__shared__ ");
-	p = isl_printer_print_str(p, var->array->type);
-	p = isl_printer_print_str(p, " ");
-	p = isl_printer_print_str(p,  var->name);
-	for (dim = 0; dim < var->array->n_index; ++dim) {
-		isl_val *extent = isl_vec_get_element_val(var->size, dim);
-
-		p = isl_printer_print_str(p, "[");
-		p = isl_printer_print_val(p, extent);
-		isl_val_free(extent);
-		p = isl_printer_print_str(p, "]");
-	}
-	p = isl_printer_print_str(p, ";");
-
-	return isl_printer_end_line(p);
-}
-
-/* Print to "p" the declarations of all variables of "kernel". */
-static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
-	struct ppcg_kernel *kernel)
-{
-	struct ppcg_kernel_var *var = kernel->var;
-	int left;
-
-	for (left = kernel->n_var; left > 0; --left, ++var)
-		p = print_kernel_var(p, var);
-
-	return p;
-}
-
-/* Print a "__syncthreads();" statement on a line of its own to "p".
- * "stmt" is not inspected.
- */
-static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
-	struct ppcg_kernel_stmt *stmt)
-{
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "__syncthreads();");
-
-	return isl_printer_end_line(p);
-}
-
-/* This function is called for each user statement in the AST,
- * i.e., for each kernel body statement, copy statement or sync statement.
- *
- * The statement is taken from the annotation of "node" and printed
- * according to its type.  "print_options" is not needed and is freed.
- * If no statement is attached to "node", then "p" is freed and
- * NULL is returned instead of dereferencing a NULL pointer.
- */
-static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *print_options,
-	__isl_keep isl_ast_node *node, void *user)
-{
-	isl_id *id;
-	struct ppcg_kernel_stmt *stmt;
-
-	id = isl_ast_node_get_annotation(node);
-	stmt = isl_id_get_user(id);
-	isl_id_free(id);
-
-	isl_ast_print_options_free(print_options);
-
-	if (!stmt)
-		return isl_printer_free(p);
-
-	switch (stmt->type) {
-	case ppcg_kernel_copy:
-		return ppcg_kernel_print_copy(p, stmt);
-	case ppcg_kernel_sync:
-		return print_sync(p, stmt);
-	case ppcg_kernel_domain:
-		return ppcg_kernel_print_domain(p, stmt);
-	}
-
-	return p;
-}
-
-/* Print the definition of "kernel" to cuda->kernel_c (and its declaration
- * to cuda->kernel_h): the header, the block and thread iterators,
- * the kernel-local variables, the required macros and finally
- * the kernel body, with user statements printed by print_kernel_stmt.
- */
-static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
-	struct cuda_info *cuda)
-{
-	isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
-	isl_ast_print_options *options;
-	isl_printer *body;
-
-	print_kernel_headers(prog, kernel, cuda);
-	fprintf(cuda->kernel_c, "{\n");
-	print_kernel_iterators(cuda->kernel_c, kernel);
-
-	body = isl_printer_to_file(ctx, cuda->kernel_c);
-	body = isl_printer_set_output_format(body, ISL_FORMAT_C);
-	body = isl_printer_indent(body, 4);
-
-	body = print_kernel_vars(body, kernel);
-	body = isl_printer_end_line(body);
-	body = ppcg_set_macro_names(body);
-	body = gpu_print_macros(body, kernel->tree);
-
-	options = isl_ast_print_options_alloc(ctx);
-	options = isl_ast_print_options_set_print_user(options,
-							&print_kernel_stmt, NULL);
-	body = isl_ast_node_print(kernel->tree, body, options);
-	isl_printer_free(body);
-
-	fprintf(cuda->kernel_c, "}\n");
-}
-
-/* Print to "p" the code that prepares the device for executing
- * the transformed code: the CUDA checking macros, the declarations
- * of the locally defined variables and the declaration and allocation
- * of the required copies of arrays on the device.
- */
-static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
-	struct gpu_prog *prog)
-{
-	p = print_cuda_macros(p);
-	p = gpu_print_local_declarations(p, prog);
-	p = declare_device_arrays(p, prog);
-
-	return allocate_device_arrays(p, prog);
-}
-
-/* Print to "p" the code that cleans up the device after executing
- * the transformed code, i.e., the code that frees the memory
- * that was allocated on the device.
- */
-static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
-	struct gpu_prog *prog)
-{
-	return free_device_arrays(p, prog);
-}
-
-/* Print the host statement of "node" that either copies an array
- * to or from the device, or initializes or clears the device.
- *
- * The statement identifier is the first argument of the user expression
- * of "node".  It is called
- * - "init_device" for initializing the device,
- * - "clear_device" for clearing the device,
- * - "to_device_<array name>" or "from_device_<array name>" for a copy,
- *   in which case its user pointer is the gpu_array_info of the array
- *   that needs to be copied.
- *
- * "p" is freed and NULL is returned if the identifier has no name or
- * if a copy statement has no array attached.
- */
-static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
-	__isl_keep isl_ast_node *node, struct gpu_prog *prog)
-{
-	isl_ast_expr *call, *callee;
-	isl_id *id;
-	const char *name;
-	struct gpu_array_info *array;
-
-	call = isl_ast_node_user_get_expr(node);
-	callee = isl_ast_expr_get_op_arg(call, 0);
-	id = isl_ast_expr_get_id(callee);
-	/* NOTE(review): "name" and "array" are used after the local references
-	 * are dropped; presumably "node" keeps the identifier alive -- confirm.
-	 */
-	name = isl_id_get_name(id);
-	array = isl_id_get_user(id);
-	isl_id_free(id);
-	isl_ast_expr_free(callee);
-	isl_ast_expr_free(call);
-
-	if (!name)
-		return isl_printer_free(p);
-	if (strcmp(name, "init_device") == 0)
-		return init_device(p, prog);
-	if (strcmp(name, "clear_device") == 0)
-		return clear_device(p, prog);
-	if (!array)
-		return isl_printer_free(p);
-
-	if (prefixcmp(name, "to_device"))
-		return copy_array_from_device(p, array);
-
-	return copy_array_to_device(p, array);
-}
-
-/* User data handed to print_host_user while printing the host code:
- * the CUDA output files and the program being printed.
- */
-struct print_host_user_data {
-	struct cuda_info *cuda;
-	struct gpu_prog *prog;
-};
-
-/* Print the user statement of the host code to "p".
- *
- * The host code may contain original user statements, kernel launches,
- * statements that copy data to/from the device and statements
- * that initialize or clear the device.
- * The original user statements and the kernel launches have
- * an associated annotation, while the other statements do not.
- * The latter are handled by print_device_node.
- * The annotation on the user statements is called "user".
- *
- * In case of a kernel launch, print a block of statements that
- * defines the grid and the block and then launches the kernel.
- *
- * "user" points to a print_host_user_data structure.
- * "print_options" is not needed and is freed.
- * If the annotation has no name or if a kernel launch has no kernel
- * attached, then "p" is freed and NULL is returned.
- */
-__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *print_options,
-	__isl_keep isl_ast_node *node, void *user)
-{
-	isl_id *id;
-	int is_user;
-	const char *name;
-	struct ppcg_kernel *kernel;
-	struct ppcg_kernel_stmt *stmt;
-	struct print_host_user_data *data;
-
-	isl_ast_print_options_free(print_options);
-
-	data = (struct print_host_user_data *) user;
-
-	id = isl_ast_node_get_annotation(node);
-	if (!id)
-		return print_device_node(p, node, data->prog);
-
-	/* An unnamed annotation cannot be classified; do not pass
-	 * a NULL pointer to strcmp.
-	 */
-	name = isl_id_get_name(id);
-	if (!name) {
-		isl_id_free(id);
-		return isl_printer_free(p);
-	}
-
-	is_user = !strcmp(name, "user");
-	kernel = is_user ? NULL : isl_id_get_user(id);
-	stmt = is_user ? isl_id_get_user(id) : NULL;
-	isl_id_free(id);
-
-	if (is_user)
-		return ppcg_kernel_print_domain(p, stmt);
-	if (!kernel)
-		return isl_printer_free(p);
-
-	p = ppcg_start_block(p);
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "dim3 k");
-	p = isl_printer_print_int(p, kernel->id);
-	p = isl_printer_print_str(p, "_dimBlock");
-	p = print_reverse_list(p, kernel->n_block, kernel->block_dim);
-	p = isl_printer_print_str(p, ";");
-	p = isl_printer_end_line(p);
-
-	p = print_grid(p, kernel);
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "kernel");
-	p = isl_printer_print_int(p, kernel->id);
-	p = isl_printer_print_str(p, " <<<k");
-	p = isl_printer_print_int(p, kernel->id);
-	p = isl_printer_print_str(p, "_dimGrid, k");
-	p = isl_printer_print_int(p, kernel->id);
-	p = isl_printer_print_str(p, "_dimBlock>>> (");
-	p = print_kernel_arguments(p, data->prog, kernel, 0);
-	p = isl_printer_print_str(p, ");");
-	p = isl_printer_end_line(p);
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "cudaCheckKernel();");
-	p = isl_printer_end_line(p);
-
-	p = ppcg_end_block(p);
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_end_line(p);
-
-	/* NOTE(review): printing of the kernel itself is compiled out here;
-	 * the reason is not visible in this file.
-	 */
-#if 0
-	print_kernel(data->prog, kernel, data->cuda);
-#endif
-
-	return p;
-}
-
-/* Print the host code for the AST "tree" of "prog" to "p":
- * first the macros required by the AST and then the AST itself,
- * with user statements printed by print_host_user.
- */
-static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
-	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
-	struct cuda_info *cuda)
-{
-	struct print_host_user_data data = { cuda, prog };
-	isl_ast_print_options *options;
-
-	options = isl_ast_print_options_alloc(isl_ast_node_get_ctx(tree));
-	options = isl_ast_print_options_set_print_user(options,
-						&print_host_user, &data);
-
-	p = gpu_print_macros(p, tree);
-
-	return isl_ast_node_print(tree, p, options);
-}
-
-/* Given a gpu_prog "prog" and the corresponding transformed AST
- * "tree", print the entire CUDA code to "p".
- * "types" collects the types for which a definition has already
- * been printed.
- * "user" points to the cuda_info with the output files.
- *
- * The type definitions are printed to the kernel file first.
- * If that fails, "p" is freed and NULL is returned.
- */
-static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
-	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
-	struct gpu_types *types, void *user)
-{
-	struct cuda_info *cuda = user;
-	isl_printer *kernel;
-	int failed;
-
-	kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
-	kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
-	kernel = gpu_print_types(kernel, types, prog);
-	/* Record the outcome before freeing: the value of a pointer
-	 * must not be inspected after the object has been released.
-	 */
-	failed = !kernel;
-	isl_printer_free(kernel);
-
-	if (failed)
-		return isl_printer_free(p);
-
-	p = print_host_code(p, prog, tree, cuda);
-
-	return p;
-}
-
-/* Replace every scop in the file called "input" by corresponding CUDA code.
- * The names of the output files are derived from "input".
- *
- * generate_gpu does all the hard work and calls back print_cuda
- * for printing the AST.  The output files needed for that printing
- * are opened beforehand and closed once generate_gpu has finished.
- * The result of generate_gpu is returned.
- */
-int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
-	const char *input)
-{
-	struct cuda_info files;
-	int status;
-
-	cuda_open_files(&files, input);
-	status = generate_gpu(ctx, input, files.host_c, options,
-				&print_cuda, &files);
-	cuda_close_files(&files);
-
-	return status;
-}
--- a/polly/lib/External/ppcg/cuda.h
+++ b/polly/lib/External/ppcg/cuda.h
@ -1,13 +0,0 @@
-#ifndef _CUDA_H
-#define _CUDA_H
-
-#include "ppcg_options.h"
-#include "ppcg.h"
-
-/* Transform the code in the file called "input" by replacing
- * all scops by corresponding CUDA code.
- * The names of the output files are derived from "input".
- */
-int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
-	const char *input);
-
-/* Print the user statement of the host code at "node" to "p".
- * "user" is expected to point to the print_host_user_data structure
- * defined in cuda.c.
- */
-__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *print_options,
-	__isl_keep isl_ast_node *node, void *user);
-#endif
--- a/polly/lib/External/ppcg/cuda_common.c
+++ b/polly/lib/External/ppcg/cuda_common.c
@ -1,50 +0,0 @@
-/*
- * Copyright 2010      INRIA Saclay
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
- * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
- * 91893 Orsay, France
- */
-
-#include <ctype.h>
-#include <limits.h>
-#include <string.h>
-
-#include "cuda_common.h"
-#include "ppcg.h"
-
-/* Open the host .cu file and the kernel .hu and .cu files for writing.
- * Add the necessary includes.
- */
-void cuda_open_files(struct cuda_info *info, const char *input)
-{
-    char name[PATH_MAX];
-    int len;
-
-    len = ppcg_extract_base_name(name, input);
-
-    strcpy(name + len, "_host.cu");
-    info->host_c = fopen(name, "w");
-
-    strcpy(name + len, "_kernel.cu");
-    info->kernel_c = fopen(name, "w");
-
-    strcpy(name + len, "_kernel.hu");
-    info->kernel_h = fopen(name, "w");
-    fprintf(info->host_c, "#include <assert.h>\n");
-    fprintf(info->host_c, "#include <stdio.h>\n");
-    fprintf(info->host_c, "#include \"%s\"\n", name);
-    fprintf(info->kernel_c, "#include \"%s\"\n", name);
-    fprintf(info->kernel_h, "#include \"cuda.h\"\n\n");
-}
-
-/* Close all output files.
- */
-void cuda_close_files(struct cuda_info *info)
-{
-    fclose(info->kernel_c);
-    fclose(info->kernel_h);
-    fclose(info->host_c);
-}
--- a/polly/lib/External/ppcg/cuda_common.h
+++ b/polly/lib/External/ppcg/cuda_common.h
@ -1,15 +0,0 @@
-#ifndef _CUDA_COMMON_H_
-#define _CUDA_COMMON_H_
-
-#include <stdio.h>
-
-struct cuda_info {
-	FILE *host_c;
-	FILE *kernel_c;
-	FILE *kernel_h;
-};
-
-void cuda_open_files(struct cuda_info *info, const char *input);
-void cuda_close_files(struct cuda_info *info);
-
-#endif
--- a/polly/lib/External/ppcg/external.c
+++ b/polly/lib/External/ppcg/external.c
@ -1,192 +0,0 @@
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <pet.h>
-#include "cpu.h"
-#include "opencl.h"
-
-
-#define die() { \
-  fprintf(stderr, "Dummy function %s called\n", __FUNCTION__); \
-  abort(); \
-}
-
-__isl_give isl_union_map *pet_scop_compute_outer_to_any(
-  __isl_keep pet_scop *scop) {
-  die();
-}
-__isl_give isl_union_map *pet_scop_compute_outer_to_inner(
-  __isl_keep pet_scop *scop) {
-  die();
-}
-enum pet_tree_type pet_tree_get_type(__isl_keep pet_tree *tree) {
-  die();
-}
-int pet_tree_foreach_access_expr(__isl_keep pet_tree *tree,
-  int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
-  die();
-}
-isl_ctx *pet_expr_get_ctx(__isl_keep pet_expr *expr) {
-  die();
-}
-isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr) {
-  die();
-}
-isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr) {
-  die();
-}
-__isl_give isl_union_map *pet_expr_access_get_tagged_may_read(
-  __isl_keep pet_expr *expr) {
-  die();
-}
-__isl_give isl_union_map *pet_expr_access_get_tagged_may_write(
-  __isl_keep pet_expr *expr) {
-  die();
-}
-__isl_give isl_union_map *pet_expr_access_get_must_write(
-  __isl_keep pet_expr *expr) {
-  die();
-}
-__isl_give isl_multi_pw_aff *pet_expr_access_get_index(
-  __isl_keep pet_expr *expr) {
-  die();
-}
-__isl_give isl_id *pet_expr_access_get_ref_id(__isl_keep pet_expr *expr) {
-  die();
-}
-__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
-  struct ppcg_scop *ps, struct ppcg_options *options) {
-  die();
-}
-
-__isl_give isl_printer *pet_stmt_print_body(struct pet_stmt *stmt,
-  __isl_take isl_printer *p, __isl_keep isl_id_to_ast_expr *ref2expr) {
-  die();
-}
-unsigned pet_loc_get_start(__isl_keep pet_loc *loc) {
-  die();
-}
-unsigned pet_loc_get_end(__isl_keep pet_loc *loc) {
-  die();
-}
-int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
-  __isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
-    __isl_take pet_scop *scop, void *user), void *user) {
-  die();
-}
-__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
-  __isl_take isl_printer *p) {
-  die();
-}
-__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop) {
-  die();
-}
-__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop) {
-  die();
-}
-int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop) {
-  die();
-}
-int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop) {
-  die();
-}
-int pet_tree_foreach_expr(__isl_keep pet_tree *tree,
-  int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
-  die();
-}
-int pet_expr_foreach_call_expr(__isl_keep pet_expr *expr,
-  int (*fn)(__isl_keep pet_expr *expr, void *user), void *user) {
-  die();
-}
-int pet_stmt_is_kill(struct pet_stmt *stmt) {
-  die();
-}
-struct isl_args pet_options_args;
-const char *ppcg_version(void) {
-  die();
-}
-int pet_options_set_encapsulate_dynamic_control(isl_ctx *ctx, int val) {
-  die();
-}
-int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
-  const char *input, const char *output) {
-  die();
-}
-int generate_cpu(isl_ctx *ctx, struct ppcg_options *options,
-  const char *input, const char *output) {
-  die();
-}
-__isl_give isl_id_to_ast_expr *pet_stmt_build_ast_exprs(struct pet_stmt *stmt,
-  __isl_keep isl_ast_build *build,
-  __isl_give isl_multi_pw_aff *(*fn_index)(
-    __isl_take isl_multi_pw_aff *mpa, __isl_keep isl_id *id,
-    void *user), void *user_index,
-  __isl_give isl_ast_expr *(*fn_expr)(__isl_take isl_ast_expr *expr,
-    __isl_keep isl_id *id, void *user), void *user_expr) {
-  die();
-}
-__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
-  __isl_keep pet_scop *scop) {
-  die();
-}
-__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop) {
-  die();
-}
-__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop) {
-  die();
-}
-__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop) {
-  die();
-}
-__isl_give isl_union_map *pet_scop_get_tagged_may_writes(
-  __isl_keep pet_scop *scop) {
-  die();
-}
-__isl_give isl_union_map *pet_scop_get_tagged_must_writes(
-  __isl_keep pet_scop *scop) {
-  die();
-}
-__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop) {
-  die();
-}
-__isl_give isl_union_map *pet_scop_get_tagged_must_kills(
-  __isl_keep pet_scop *scop) {
-  die();
-}
-__isl_keep const char *pet_expr_call_get_name(__isl_keep pet_expr *expr) {
-  die();
-}
-__isl_give pet_expr *pet_expr_call_set_name(__isl_take pet_expr *expr,
-  __isl_keep const char *name) {
-  die();
-}
-__isl_give pet_expr *pet_expr_get_arg(__isl_keep pet_expr *expr, int pos) {
-  die();
-}
-__isl_give pet_expr *pet_expr_new_cast(const char *type_name,
-  __isl_take pet_expr *arg) {
-  die();
-}
-__isl_give pet_expr *pet_expr_set_arg(__isl_take pet_expr *expr, int pos,
-  __isl_take pet_expr *arg) {
-  die();
-}
-__isl_give pet_tree *pet_tree_copy(__isl_keep pet_tree *tree) {
-  die();
-}
-__isl_null pet_tree *pet_tree_free(__isl_take pet_tree *tree) {
-  die();
-}
-__isl_give pet_tree *pet_tree_map_call_expr(__isl_take pet_tree *tree,
-  __isl_give pet_expr *(*fn)(__isl_take pet_expr *expr, void *user),
-  void *user) {
-  die();
-}
-__isl_give isl_union_map *pet_expr_access_get_may_read(
-  __isl_keep pet_expr *expr) {
-  die();
-}
-__isl_give isl_union_map *pet_expr_access_get_may_write(
-  __isl_keep pet_expr *expr) {
-  die();
-}
--- a/polly/lib/External/ppcg/gpu.c
+++ b/polly/lib/External/ppcg/gpu.c
--- a/polly/lib/External/ppcg/gpu.h
+++ b/polly/lib/External/ppcg/gpu.h
@ -1,459 +0,0 @@
-#ifndef _GPU_H
-#define _GPU_H
-
-#include <isl/ast.h>
-#include <isl/id.h>
-#include <isl/id_to_ast_expr.h>
-
-#include <pet.h>
-
-#include "ppcg.h"
-#include "ppcg_options.h"
-
-/* An access to an outer array element or an iterator.
- * Accesses to iterators have an access relation that maps to an unnamed space.
- * An access may be both read and write.
- * If the access relation is empty, then the output dimension may
- * not be equal to the dimension of the corresponding array.
- */
-struct gpu_stmt_access {
-	/* Access reads elements */
-	int read;
-	/* Access writes elements */
-	int write;
-	/* All writes are definite writes. */
-	int exact_write;
-	/* Is a single, fixed element being accessed? */
-	isl_bool fixed_element;
-	/* The number of index expressions specified in the access. */
-	int n_index;
-
-	/* May access relation */
-	isl_map *access;
-	/* May access relation with as domain a mapping from iteration domain
-	 * to a reference identifier.
-	 */
-	isl_map *tagged_access;
-	/* The reference id of the corresponding pet_expr. */
-	isl_id *ref_id;
-
-	struct gpu_stmt_access *next;
-};
-
-/* A representation of a user statement.
- * "stmt" points to the corresponding pet statement.
- * "id" is the identifier of the instance set of the statement.
- * "accesses" is a linked list of accesses performed by the statement.
- * If the statement has been killed, i.e., if it will not be scheduled,
- * then this linked list may be empty even if the actual statement does
- * perform accesses.
- */
-struct gpu_stmt {
-	isl_id *id;
-	struct pet_stmt *stmt;
-
-	struct gpu_stmt_access *accesses;
-};
-
-/* Represents an outer array possibly accessed by a gpu_prog.
- */
-struct gpu_array_info {
-	/* The array data space. */
-	isl_space *space;
-	/* Element type. */
-	char *type;
-	/* Element size. */
-	int size;
-	/* Name of the array. */
-	char *name;
-	/* Declared extent of original array. */
-	isl_set *declared_extent;
-	/* AST expression for declared size of original array. */
-	isl_ast_expr *declared_size;
-	/* Extent of the array that needs to be copied. */
-	isl_set *extent;
-	/* Number of indices. */
-	unsigned n_index;
-	/* For each index, a bound on "extent" in that direction. */
-	isl_multi_pw_aff *bound;
-	/* The corresponding access AST expression, if the array needs
-	 * to be allocated on the device.
-	 */
-	isl_ast_expr *bound_expr;
-
-	/* All references to this array; point to elements of a linked list. */
-	int n_ref;
-	struct gpu_stmt_access **refs;
-
-	/* Is this array accessed at all by the program? */
-	int accessed;
-
-	/* Is this a scalar that is read-only within the entire program? */
-	int read_only_scalar;
-
-	/* Are the elements of the array structures? */
-	int has_compound_element;
-
-	/* Are the elements only accessed through constant index expressions? */
-	int only_fixed_element;
-
-	/* Is the array local to the scop? */
-	int local;
-	/* Is the array local and should it be declared on the host? */
-	int declare_local;
-
-	/* Is the corresponding global device memory accessed in any way? */
-	int global;
-
-	/* Should the array be linearized? */
-	int linearize;
-
-	/* Order dependences on this array.
-	 * Only used if live_range_reordering option is set.
-	 * It is set to NULL otherwise.
-	 */
-	isl_union_map *dep_order;
-
-    void *user;
-};
-
-/* Represents an outer array accessed by a ppcg_kernel, localized
- * to the context of this kernel.
- *
- * "array" points to the corresponding array in the gpu_prog.
- * The "n_group" "groups" are the reference groups associated to the array.
- * If "force_private" is set, then the array (in practice a scalar)
- * must be mapped to a register.
- * "global" is set if the global device memory corresponding
- * to this array is accessed by the kernel.
- * "bound" is equal to array->bound specialized to the current kernel.
- * "bound_expr" is the corresponding access AST expression.
- */
-struct gpu_local_array_info {
-	struct gpu_array_info *array;
-
-	int n_group;
-	struct gpu_array_ref_group **groups;
-
-	int force_private;
-	int global;
-
-	unsigned n_index;
-	isl_multi_pw_aff *bound;
-	isl_ast_expr *bound_expr;
-};
-
-__isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
-	struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr);
-
-/* A sequence of "n" names of types.
- */
-struct gpu_types {
-	int n;
-	char **name;
-};
-
-/* "read" and "write" contain the original access relations, possibly
- * involving member accesses.
- *
- * The elements of "array", as well as the ranges of "copy_in" and "copy_out"
- * only refer to the outer arrays of any possible member accesses.
- */
-struct gpu_prog {
-	isl_ctx *ctx;
-
-	struct ppcg_scop *scop;
-
-	/* Set of parameter values */
-	isl_set *context;
-
-	/* All potential read accesses in the entire program */
-	isl_union_map *read;
-
-	/* All potential write accesses in the entire program */
-	isl_union_map *may_write;
-	/* All definite write accesses in the entire program */
-	isl_union_map *must_write;
-	/* All tagged definite kills in the entire program */
-	isl_union_map *tagged_must_kill;
-
-	/* The set of inner array elements that may be preserved. */
-	isl_union_set *may_persist;
-
-	/* A mapping from all innermost arrays to their outer arrays. */
-	isl_union_map *to_outer;
-	/* A mapping from the outer arrays to all corresponding inner arrays. */
-	isl_union_map *to_inner;
-	/* A mapping from all intermediate arrays to their outer arrays,
-	 * including an identity mapping from the anonymous 1D space to itself.
-	 */
-	isl_union_map *any_to_outer;
-
-	/* Order dependences on non-scalars. */
-	isl_union_map *array_order;
-
-	/* Array of statements */
-	int n_stmts;
-	struct gpu_stmt *stmts;
-
-	int n_array;
-	struct gpu_array_info *array;
-};
-
-struct gpu_gen {
-	isl_ctx *ctx;
-	struct ppcg_options *options;
-
-	/* Callback for printing of AST in appropriate format. */
-	__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
-		struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
-		struct gpu_types *types, void *user);
-	void *print_user;
-
-    isl_id_to_ast_expr *(*build_ast_expr)(void *stmt,
-            isl_ast_build *build,
-            isl_multi_pw_aff *(*fn_index)(
-                __isl_take isl_multi_pw_aff *mpa, isl_id *id,
-                void *user),
-            void *user_index,
-            isl_ast_expr *(*fn_expr)(isl_ast_expr *expr,
-                isl_id *id, void *user),
-        void *user_expr);
-
-	struct gpu_prog *prog;
-	/* The generated AST. */
-	isl_ast_node *tree;
-
-	/* The sequence of types for which a definition has been printed. */
-	struct gpu_types types;
-
-	/* User specified tile, grid and block sizes for each kernel */
-	isl_union_map *sizes;
-
-	/* Effectively used tile, grid and block sizes for each kernel */
-	isl_union_map *used_sizes;
-
-	/* Identifier of the next kernel. */
-	int kernel_id;
-};
-
-enum ppcg_group_access_type {
-	ppcg_access_global,
-	ppcg_access_shared,
-	ppcg_access_private
-};
-
-enum ppcg_kernel_stmt_type {
-	ppcg_kernel_copy,
-	ppcg_kernel_domain,
-	ppcg_kernel_sync
-};
-
-/* Representation of special statements, in particular copy statements
- * and __syncthreads statements, inside a kernel.
- *
- * type represents the kind of statement
- *
- *
- * for ppcg_kernel_copy statements we have
- *
- * read is set if the statement should copy data from global memory
- * to shared memory or registers.
- *
- * index expresses an access to the array element that needs to be copied
- * local_index expresses the corresponding element in the tile
- *
- * array refers to the original array being copied
- * local_array is a pointer to the appropriate element in the "array"
- *	array of the ppcg_kernel to which this copy access belongs
- *
- *
- * for ppcg_kernel_domain statements we have
- *
- * stmt is the corresponding input statement
- *
- * n_access is the number of accesses in stmt
- * access is an array of local information about the accesses
- */
-struct ppcg_kernel_stmt {
-	enum ppcg_kernel_stmt_type type;
-
-	union {
-		struct {
-			int read;
-			isl_ast_expr *index;
-			isl_ast_expr *local_index;
-			struct gpu_array_info *array;
-			struct gpu_local_array_info *local_array;
-		} c;
-		struct {
-			struct gpu_stmt *stmt;
-			isl_id_to_ast_expr *ref2expr;
-		} d;
-	} u;
-};
-
-/* Representation of a local variable in a kernel.
- */
-struct ppcg_kernel_var {
-	struct gpu_array_info *array;
-	enum ppcg_group_access_type type;
-	char *name;
-	isl_vec *size;
-};
-
-/* Representation of a kernel.
- *
- * prog describes the original code from which the kernel is extracted.
- *
- * id is the sequence number of the kernel.
- *
- * block_ids contains the list of block identifiers for this kernel.
- * thread_ids contains the list of thread identifiers for this kernel.
- *
- * the first n_grid elements of grid_dim represent the specified size
- * of the grid.
- * the first n_block elements of block_dim represent the specified or
- * effective size of the block.
- * Note that in the input file, the sizes of the grid and the blocks
- * are specified in the order x, y, z, but internally, the sizes
- * are stored in reverse order, so that the last element always
- * refers to the x dimension.
- *
- * grid_size reflects the effective grid size.
- * grid_size_expr contains a corresponding access AST expression, built within
- * the context where the launch appears.
- *
- * context contains the values of the parameters and outer schedule dimensions
- * for which any statement instance in this kernel needs to be executed.
- *
- * n_sync is the number of synchronization operations that have
- * been introduced in the schedule tree corresponding to this kernel (so far).
- *
- * core contains the spaces of the statement domains that form
- * the core computation of the kernel.  It is used to navigate
- * the tree during the construction of the device part of the schedule
- * tree in gpu_create_kernel.
- *
- * expanded_domain contains the original statement instances,
- * i.e., those that appear in the domains of access relations,
- * that are involved in the kernel.
- * contraction maps those original statement instances to
- * the statement instances that are active at the point
- * in the schedule tree where the kernel is created.
- *
- * arrays is the set of possibly accessed outer array elements.
- *
- * space is the schedule space of the AST context.  That is, it represents
- * the loops of the generated host code containing the kernel launch.
- *
- * n_array is the total number of arrays in the input program and also
- * the number of element in the array array.
- * array contains information about each array that is local
- * to the current kernel.  If an array is not used in a kernel,
- * then the corresponding entry does not contain any information.
- *
- * any_force_private is set if any array in the kernel is marked force_private
- *
- * block_filter contains constraints on the domain elements in the kernel
- * that encode the mapping to block identifiers, where the block identifiers
- * are represented by "n_grid" parameters with as names the elements
- * of "block_ids".
- *
- * thread_filter contains constraints on the domain elements in the kernel
- * that encode the mapping to thread identifiers, where the thread identifiers
- * are represented by "n_block" parameters with as names the elements
- * of "thread_ids".
- *
- * copy_schedule corresponds to the schedule dimensions of
- * the (tiled) schedule for this kernel that have been taken into account
- * for computing private/shared memory tiles.
- * The domain corresponds to the original statement instances, i.e.,
- * those that appear in the leaves of the schedule tree.
- * copy_schedule_dim is the dimension of this schedule.
- *
- * sync_writes contains write references that require synchronization.
- * Each reference is represented by a universe set in a space [S[i,j] -> R[]]
- * with S[i,j] the statement instance space and R[] the array reference.
- */
-struct ppcg_kernel {
-	isl_ctx *ctx;
-	struct ppcg_options *options;
-
-	struct gpu_prog *prog;
-
-	int id;
-
-	isl_id_list *block_ids;
-	isl_id_list *thread_ids;
-
-	int n_grid;
-	int n_block;
-	int grid_dim[2];
-	int block_dim[3];
-
-	isl_multi_pw_aff *grid_size;
-	isl_ast_expr *grid_size_expr;
-	isl_set *context;
-
-	int n_sync;
-	isl_union_set *core;
-	isl_union_set *arrays;
-
-	isl_union_pw_multi_aff *contraction;
-	isl_union_set *expanded_domain;
-
-	isl_space *space;
-
-	int n_array;
-	struct gpu_local_array_info *array;
-
-	int n_var;
-	struct ppcg_kernel_var *var;
-
-	int any_force_private;
-
-	isl_union_set *block_filter;
-	isl_union_set *thread_filter;
-	isl_union_pw_multi_aff *copy_schedule;
-	int copy_schedule_dim;
-
-	isl_union_set *sync_writes;
-
-	isl_ast_node *tree;
-};
-
-int gpu_array_is_scalar(struct gpu_array_info *array);
-int gpu_array_is_read_only_scalar(struct gpu_array_info *array);
-int gpu_array_requires_device_allocation(struct gpu_array_info *array);
-__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array);
-isl_bool gpu_array_can_be_private(struct gpu_array_info *array);
-
-struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop);
-void *gpu_prog_free(struct gpu_prog *prog);
-
-int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i);
-
-int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
-	struct ppcg_options *options,
-	__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
-		struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
-		struct gpu_types *types, void *user), void *user);
-
-__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
-	__isl_take isl_schedule_node *node, int scale,
-	__isl_keep isl_multi_val *sizes);
-
-__isl_give isl_schedule *get_schedule(struct gpu_gen *gen);
-int has_any_permutable_node(__isl_keep isl_schedule *schedule);
-__isl_give isl_schedule *map_to_device(struct gpu_gen *gen,
-                                       __isl_take isl_schedule *schedule,
-                                      int to_from_device);
-__isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
-                                       __isl_take isl_schedule *schedule);
-
-__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog);
-void collect_references(struct gpu_prog *prog, struct gpu_array_info *array);
-void collect_order_dependences(struct gpu_prog *prog);
-isl_bool only_fixed_element_accessed(struct gpu_array_info *array);
-#endif
--- a/polly/lib/External/ppcg/gpu_array_tile.c
+++ b/polly/lib/External/ppcg/gpu_array_tile.c
@ -1,71 +0,0 @@
-#include <isl/aff.h>
-#include <isl/map.h>
-
-#include "gpu_array_tile.h"
-
-struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile)
-{
-	int j;
-
-	if (!tile)
-		return NULL;
-
-	for (j = 0; j < tile->n; ++j) {
-		isl_val_free(tile->bound[j].size);
-		isl_val_free(tile->bound[j].stride);
-		isl_aff_free(tile->bound[j].lb);
-		isl_aff_free(tile->bound[j].shift);
-	}
-	free(tile->bound);
-	isl_multi_aff_free(tile->tiling);
-	free(tile);
-
-	return NULL;
-}
-
-/* Create a gpu_array_tile for an array of dimension "n_index".
- */
-struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index)
-{
-	int i;
-	struct gpu_array_tile *tile;
-
-	tile = isl_calloc_type(ctx, struct gpu_array_tile);
-	if (!tile)
-		return NULL;
-
-	tile->ctx = ctx;
-	tile->bound = isl_alloc_array(ctx, struct gpu_array_bound, n_index);
-	if (!tile->bound)
-		return gpu_array_tile_free(tile);
-
-	tile->n = n_index;
-
-	for (i = 0; i < n_index; ++i) {
-		tile->bound[i].size = NULL;
-		tile->bound[i].lb = NULL;
-		tile->bound[i].stride = NULL;
-		tile->bound[i].shift = NULL;
-	}
-
-	return tile;
-}
-
-/* Compute the size of the tile specified by "tile"
- * in number of elements and return the result.
- */
-__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile)
-{
-	int i;
-	isl_val *size;
-
-	if (!tile)
-		return NULL;
-
-	size = isl_val_one(tile->ctx);
-
-	for (i = 0; i < tile->n; ++i)
-		size = isl_val_mul(size, isl_val_copy(tile->bound[i].size));
-
-	return size;
-}
--- a/polly/lib/External/ppcg/gpu_array_tile.h
+++ b/polly/lib/External/ppcg/gpu_array_tile.h
@ -1,59 +0,0 @@
-#ifndef GPU_ARRAY_TILE_H
-#define GPU_ARRAY_TILE_H
-
-#include <isl/aff_type.h>
-#include <isl/map_type.h>
-#include <isl/val.h>
-
-/* The fields stride and shift only contain valid information
- * if shift != NULL.
- * If so, they express that current index is such that if you add shift,
- * then the result is always a multiple of stride.
- * Let D represent the initial tile->depth dimensions of the computed schedule.
- * The spaces of "lb" and "shift" are of the form
- *
- *	D -> [b]
- */
-struct gpu_array_bound {
-	isl_val *size;
-	isl_aff *lb;
-
-	isl_val *stride;
-	isl_aff *shift;
-};
-
-/* A tile of an outer array.
- *
- * requires_unroll is set if the schedule dimensions that are mapped
- * to threads need to be unrolled for this (private) tile to be used.
- *
- * "depth" reflects the number of schedule dimensions that affect the tile.
- * The copying into and/or out of the tile is performed at that depth.
- *
- * n is the dimension of the array.
- * bound is an array of size "n" representing the lower bound
- *	and size for each index.
- *
- * tiling maps a tile in the global array to the corresponding
- * shared/private memory tile and is of the form
- *
- *	{ [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
- *
- * where D represents the initial "depth" dimensions
- * of the computed schedule.
- */
-struct gpu_array_tile {
-	isl_ctx *ctx;
-	int requires_unroll;
-	int depth;
-	int n;
-	struct gpu_array_bound *bound;
-	isl_multi_aff *tiling;
-};
-
-struct gpu_array_tile *gpu_array_tile_create(isl_ctx *ctx, int n_index);
-struct gpu_array_tile *gpu_array_tile_free(struct gpu_array_tile *tile);
-
-__isl_give isl_val *gpu_array_tile_size(struct gpu_array_tile *tile);
-
-#endif
--- a/polly/lib/External/ppcg/gpu_group.c
+++ b/polly/lib/External/ppcg/gpu_group.c
--- a/polly/lib/External/ppcg/gpu_group.h
+++ b/polly/lib/External/ppcg/gpu_group.h
@ -1,65 +0,0 @@
-#ifndef GPU_GROUP_H
-#define GPU_GROUP_H
-
-#include <isl/schedule_node.h>
-#include "gpu.h"
-
-/* A group of array references in a kernel that should be handled together.
- * If private_tile is not NULL, then it is mapped to registers.
- * Otherwise, if shared_tile is not NULL, it is mapped to shared memory.
- * Otherwise, it is accessed from global memory.
- * Note that if both private_tile and shared_tile are set, then shared_tile
- * is only used inside group_common_shared_memory_tile.
- */
-struct gpu_array_ref_group {
-	/* The references in this group access this local array. */
-	struct gpu_local_array_info *local_array;
-	/* This is the corresponding array. */
-	struct gpu_array_info *array;
-	/* Position of this group in the list of reference groups of array. */
-	int nr;
-
-	/* The following fields are use during the construction of the groups.
-	 * access is the combined access relation relative to the private
-	 * memory tiling.  In particular, the domain of the map corresponds
-	 * to the first thread_depth dimensions of the kernel schedule.
-	 * write is set if any access in the group is a write.
-	 * exact_write is set if all writes are definite writes.
-	 * slice is set if there is at least one access in the group
-	 * that refers to more than one element
-	 * "min_depth" is the minimum of the tile depths and thread_depth.
-	 */
-	isl_map *access;
-	int write;
-	int exact_write;
-	int slice;
-	int min_depth;
-
-	/* The shared memory tile, NULL if none. */
-	struct gpu_array_tile *shared_tile;
-
-	/* The private memory tile, NULL if none. */
-	struct gpu_array_tile *private_tile;
-
-	/* References in this group; point to elements of a linked list. */
-	int n_ref;
-	struct gpu_stmt_access **refs;
-};
-
-int gpu_group_references(struct ppcg_kernel *kernel,
-	__isl_keep isl_schedule_node *node);
-
-__isl_give isl_printer *gpu_array_ref_group_print_name(
-	struct gpu_array_ref_group *group, __isl_take isl_printer *p);
-void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group);
-__isl_give isl_union_map *gpu_array_ref_group_access_relation(
-	struct gpu_array_ref_group *group, int read, int write);
-int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group);
-enum ppcg_group_access_type gpu_array_ref_group_type(
-	struct gpu_array_ref_group *group);
-struct gpu_array_tile *gpu_array_ref_group_tile(
-	struct gpu_array_ref_group *group);
-struct gpu_array_ref_group *gpu_array_ref_group_free(
-	struct gpu_array_ref_group *group);
-
-#endif
--- a/polly/lib/External/ppcg/gpu_hybrid.c
+++ b/polly/lib/External/ppcg/gpu_hybrid.c
@ -1,146 +0,0 @@
-/*
- * Copyright 2013      Ecole Normale Superieure
- * Copyright 2015      Sven Verdoolaege
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
- */
-
-#include <string.h>
-
-#include <isl/val.h>
-#include <isl/space.h>
-#include <isl/union_set.h>
-#include <isl/schedule_node.h>
-
-#include "hybrid.h"
-#include "gpu_hybrid.h"
-#include "gpu_tree.h"
-#include "schedule.h"
-#include "util.h"
-
-/* Have all domain elements been filtered out before reaching
- * the "node" position in the schedule tree?
- */
-static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node)
-{
-	isl_union_set *domain;
-	isl_bool empty;
-
-	domain = isl_schedule_node_get_domain(node);
-	empty = isl_union_set_is_empty(domain);
-	isl_union_set_free(domain);
-
-	return empty;
-}
-
-/* Given a pointer to a phase in the result of hybrid tiling,
- * map the phase to the device, provided the phase is non-empty.
- * Empty phases can occur if the input schedule domain can be
- * covered by a small number of hexagons that all belong to the same phase.
- *
- * The input has the following form:
- *
- *	M - CT - P - C - ...
- *
- * with M the phase marker, CT the space tiling, P the original
- * parent band and C the original child band.
- * The (outer dimensions of the) C band need to be mapped to threads.
- * The (outer dimension of the) CT band needs to be mapped to blocks.
- * The mapping to shared memory needs to be computed between the CT and
- * the P band.
- *
- * The C band is first shifted to start at zero.
- * Then the appropriate markers are introduced and a kernel is
- * created for the tree rooted at CT.
- * If the "unroll_gpu_tile" option is set, then the AST generator
- * is instructed to unroll the P and C bands.
- */
-static __isl_give isl_schedule_node *update_phase(
-	__isl_take isl_schedule_node *node, void *user)
-{
-	struct gpu_gen *gen = user;
-	int depth0, depth;
-	isl_ctx *ctx;
-	isl_id *id;
-	isl_bool empty_domain;
-	ppcg_ht_phase *phase;
-
-	empty_domain = has_empty_domain(node);
-	if (empty_domain < 0)
-		return isl_schedule_node_free(node);
-	if (empty_domain)
-		return node;
-
-	if (!node)
-		return NULL;
-	ctx = isl_schedule_node_get_ctx(node);
-
-	phase = ppcg_ht_phase_extract_from_mark(node);
-
-	depth0 = isl_schedule_node_get_tree_depth(node);
-
-	node = isl_schedule_node_child(node, 0);
-
-	node = isl_schedule_node_child(node, 0);
-	node = isl_schedule_node_child(node, 0);
-	node = ppcg_ht_phase_shift_space_point(phase, node);
-	if (gen->options->unroll_gpu_tile)
-		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
-	id = isl_id_alloc(ctx, "thread", NULL);
-	node = isl_schedule_node_insert_mark(node, id);
-	node = isl_schedule_node_parent(node);
-	if (gen->options->unroll_gpu_tile)
-		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
-	id = isl_id_alloc(ctx, "shared", NULL);
-	node = isl_schedule_node_insert_mark(node, id);
-	node = isl_schedule_node_parent(node);
-
-	node = gpu_create_kernel(gen, node, 0, NULL);
-
-	depth = isl_schedule_node_get_tree_depth(node);
-	node = isl_schedule_node_ancestor(node, depth - depth0);
-
-	return node;
-}
-
-/* Apply hybrid tiling on "node" and its parent based on the (valid)
- * bounds on the relative dependence distances "bounds" and
- * the tile sizes in "tile_sizes".
- * The number of elements in "tile_sizes" is at least as large
- * as the sum of the dimensions of the parent and the child node.
- *
- * Convert the tile_sizes to an isl_multi_val in the right space,
- * insert the hybrid tiling and then create a kernel inside each phase.
- * Finally, remove the phase marks.
- */
-__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
-	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
-	int *tile_sizes)
-{
-	isl_multi_val *mv;
-	isl_space *space, *space2;
-
-	if (!node || !bounds)
-		goto error;
-
-	space2 = isl_schedule_node_band_get_space(node);
-	node = isl_schedule_node_parent(node);
-	space = isl_schedule_node_band_get_space(node);
-	space = isl_space_product(space, space2);
-	mv = ppcg_multi_val_from_int_list(space, tile_sizes);
-
-	node = ppcg_ht_bounds_insert_tiling(bounds, mv, node, gen->options);
-
-	node = hybrid_tile_foreach_phase(node, &update_phase, gen);
-
-	node = hybrid_tile_drop_phase_marks(node);
-
-	return node;
-error:
-	isl_schedule_node_free(node);
-	ppcg_ht_bounds_free(bounds);
-	return NULL;
-}
--- a/polly/lib/External/ppcg/gpu_hybrid.h
+++ b/polly/lib/External/ppcg/gpu_hybrid.h
@ -1,13 +0,0 @@
-#ifndef GPU_HYBRID_H
-#define GPU_HYBRID_H
-
-#include <isl/schedule_node.h>
-
-#include "gpu.h"
-#include "hybrid.h"
-
-__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
-	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
-	int *tile_sizes);
-
-#endif
--- a/polly/lib/External/ppcg/gpu_print.c
+++ b/polly/lib/External/ppcg/gpu_print.c
@ -1,310 +0,0 @@
-/*
- * Copyright 2012      Ecole Normale Superieure
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
- */
-
-#include <string.h>
-
-#include <isl/aff.h>
-
-#include "gpu_print.h"
-#include "print.h"
-#include "schedule.h"
-
-/* Print to "p" a declaration for every array of "prog" that has
- * its declare_local flag set, using the array's type and declared size.
- * If "prog" is NULL, "p" is freed and NULL is returned.
- */
-__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p,
-	struct gpu_prog *prog)
-{
-	int idx;
-
-	if (!prog)
-		return isl_printer_free(p);
-
-	for (idx = 0; idx < prog->n_array; ++idx) {
-		struct gpu_array_info *info = &prog->array[idx];
-
-		if (info->declare_local)
-			p = ppcg_print_declaration_with_size(p, info->type,
-						info->declared_size);
-	}
-
-	return p;
-}
-
-/* Print the size of "array" in bytes as an expression of the form
- *
- *	(bound_0) * (bound_1) * ... * sizeof(type)
- *
- * where bound_i is argument 1 + i of array->bound_expr.
- */
-__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn,
-	struct gpu_array_info *array)
-{
-	int dim = 0;
-
-	while (dim < array->n_index) {
-		isl_ast_expr *extent;
-
-		prn = isl_printer_print_str(prn, "(");
-		extent = isl_ast_expr_get_op_arg(array->bound_expr, dim + 1);
-		prn = isl_printer_print_ast_expr(prn, extent);
-		isl_ast_expr_free(extent);
-		prn = isl_printer_print_str(prn, ") * ");
-		++dim;
-	}
-
-	prn = isl_printer_print_str(prn, "sizeof(");
-	prn = isl_printer_print_str(prn, array->type);
-	return isl_printer_print_str(prn, ")");
-}
-
-/* Print the declaration of an array argument that is not linearized:
- * the element type, a space and then array->bound_expr.
- */
-static __isl_give isl_printer *print_non_linearized_declaration_argument(
-	__isl_take isl_printer *p, struct gpu_array_info *array)
-{
-	p = isl_printer_print_str(p, array->type);
-	p = isl_printer_print_str(p, " ");
-	return isl_printer_print_ast_expr(p, array->bound_expr);
-}
-
-/* Print the declaration of the argument corresponding to "array".
- *
- * A read-only scalar is printed as "type name", without any prefix.
- * Any other array is optionally prefixed by "memory_space" (if not NULL)
- * and is then printed either in its non-linearized form or
- * as a pointer "type *name".
- */
-__isl_give isl_printer *gpu_array_info_print_declaration_argument(
-	__isl_take isl_printer *p, struct gpu_array_info *array,
-	const char *memory_space)
-{
-	int by_value = gpu_array_is_read_only_scalar(array);
-
-	if (!by_value) {
-		if (memory_space) {
-			p = isl_printer_print_str(p, memory_space);
-			p = isl_printer_print_str(p, " ");
-		}
-		if (array->n_index != 0 && !array->linearize)
-			return print_non_linearized_declaration_argument(p,
-									array);
-	}
-
-	p = isl_printer_print_str(p, array->type);
-	p = isl_printer_print_str(p, " ");
-	if (!by_value)
-		p = isl_printer_print_str(p, "*");
-	return isl_printer_print_str(p, array->name);
-}
-
-/* Print the argument passed for "array" in a call.
- * A read-only scalar is passed by its own name,
- * anything else by its name prefixed with "dev_".
- */
-__isl_give isl_printer *gpu_array_info_print_call_argument(
-	__isl_take isl_printer *p, struct gpu_array_info *array)
-{
-	if (!gpu_array_is_read_only_scalar(array))
-		p = isl_printer_print_str(p, "dev_");
-
-	return isl_printer_print_str(p, array->name);
-}
-
-/* Print the private/shared memory side of the copy statement "stmt",
- * i.e., the access expression stored in stmt->u.c.local_index.
- */
-static __isl_give isl_printer *stmt_print_local_index(__isl_take isl_printer *p,
-	struct ppcg_kernel_stmt *stmt)
-{
-	isl_ast_expr *local = stmt->u.c.local_index;
-
-	return isl_printer_print_ast_expr(p, local);
-}
-
-/* Print the global memory side of the copy statement "stmt".
- *
- * For a non-scalar array, this is the access expression stored
- * in stmt->u.c.index.  A scalar is printed by name, preceded by "*"
- * unless it is a read-only scalar.
- */
-static __isl_give isl_printer *stmt_print_global_index(
-	__isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
-{
-	struct gpu_array_info *array = stmt->u.c.array;
-	isl_ast_expr *expr;
-
-	if (!gpu_array_is_scalar(array)) {
-		expr = isl_ast_expr_copy(stmt->u.c.index);
-		p = isl_printer_print_ast_expr(p, expr);
-		isl_ast_expr_free(expr);
-		return p;
-	}
-
-	if (!gpu_array_is_read_only_scalar(array))
-		p = isl_printer_print_str(p, "*");
-
-	return isl_printer_print_str(p, array->name);
-}
-
-/* Print the copy statement "stmt" on a line of its own.
- *
- * If stmt->u.c.read is set, the statement is printed as
- *
- *	local = global;
- *
- * and otherwise as
- *
- *	global = local;
- */
-__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p,
-	struct ppcg_kernel_stmt *stmt)
-{
-	int is_read = stmt->u.c.read;
-
-	p = isl_printer_start_line(p);
-
-	/* left-hand side */
-	if (is_read)
-		p = stmt_print_local_index(p, stmt);
-	else
-		p = stmt_print_global_index(p, stmt);
-
-	p = isl_printer_print_str(p, " = ");
-
-	/* right-hand side */
-	if (is_read)
-		p = stmt_print_global_index(p, stmt);
-	else
-		p = stmt_print_local_index(p, stmt);
-
-	p = isl_printer_print_str(p, ";");
-	return isl_printer_end_line(p);
-}
-
-/* Print the domain statement "stmt" to "p" by passing
- * the statement stored in stmt->u.d.stmt and stmt->u.d.ref2expr
- * to pet_stmt_print_body.
- */
-__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p,
-	struct ppcg_kernel_stmt *stmt)
-{
-	struct ppcg_stmt *domain_stmt = stmt->u.d.stmt;
-
-	return pet_stmt_print_body(domain_stmt->stmt, p, stmt->u.d.ref2expr);
-}
-
-/* This function is called for each node in a GPU AST.
- * In case of a user node, print the macro definitions required
- * for printing the AST expressions in the annotation, if any.
- * For other nodes, return true such that descendants are also
- * visited.
- *
- * In particular, for a kernel launch, print the macro definitions
- * needed for the grid size.
- * For a copy statement, print the macro definitions needed
- * for the two index expressions.
- * For an original user statement, print the macro definitions
- * needed for the substitutions.
- *
- * "user" points to the isl_printer pointer that is updated in place.
- * Return isl_bool_error if the annotation has no name, if it carries
- * no user pointer or if printing fails.
- * The reference to the annotation obtained from "node" is released
- * on every path, including the error paths.
- */
-static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
-{
-	const char *name;
-	isl_id *id;
-	int is_kernel;
-	struct ppcg_kernel *kernel;
-	struct ppcg_kernel_stmt *stmt;
-	isl_printer **p = user;
-
-	if (isl_ast_node_get_type(node) != isl_ast_node_user)
-		return isl_bool_true;
-
-	id = isl_ast_node_get_annotation(node);
-	if (!id)
-		return isl_bool_false;
-
-	name = isl_id_get_name(id);
-	if (!name) {
-		/* Do not leak the annotation on an unnamed identifier. */
-		isl_id_free(id);
-		return isl_bool_error;
-	}
-	is_kernel = !strcmp(name, "kernel");
-	kernel = is_kernel ? isl_id_get_user(id) : NULL;
-	stmt = is_kernel ? NULL : isl_id_get_user(id);
-	isl_id_free(id);
-
-	if ((is_kernel && !kernel) || (!is_kernel && !stmt))
-		return isl_bool_error;
-
-	if (is_kernel) {
-		*p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p);
-	} else if (stmt->type == ppcg_kernel_copy) {
-		*p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p);
-		*p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p);
-	} else if (stmt->type == ppcg_kernel_domain) {
-		*p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr);
-	}
-	if (!*p)
-		return isl_bool_error;
-
-	return isl_bool_false;
-}
-
-/* Print to "p" all macro definitions needed by the GPU AST "node".
- * The macros required by the user statements inside the AST are
- * collected first (through at_node), followed by those of
- * the AST itself.
- */
-__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
-	__isl_keep isl_ast_node *node)
-{
-	isl_stat visited;
-
-	visited = isl_ast_node_foreach_descendant_top_down(node, &at_node, &p);
-	if (visited < 0)
-		return isl_printer_free(p);
-
-	return ppcg_print_macros(p, node);
-}
-
-/* Has "type" been printed already, i.e., is its name equal to
- * one of the names recorded in "types"?
- * Return 1 if so and 0 otherwise.
- */
-static int already_printed(struct gpu_types *types,
-	struct pet_type *type)
-{
-	int pos = 0;
-
-	while (pos < types->n) {
-		if (strcmp(types->name[pos], type->name) == 0)
-			return 1;
-		++pos;
-	}
-
-	return 0;
-}
-
-/* Print the definitions of all types in prog->scop that have not been
- * printed before (according to "types") on "p".
- * Extend the list of printed types "types" with the newly printed types.
- *
- * The name of each newly printed type is duplicated before it is
- * recorded in "types".  If the array of names cannot be extended or
- * if a name cannot be duplicated, then "p" is freed and NULL is returned,
- * such that "types" never ends up containing a NULL name.
- */
-__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p,
-	struct gpu_types *types, struct gpu_prog *prog)
-{
-	int i, n;
-	isl_ctx *ctx;
-	char **name;
-
-	n = prog->scop->pet->n_type;
-
-	if (n == 0)
-		return p;
-
-	ctx = isl_printer_get_ctx(p);
-	/* Reserve room for the worst case where every type is new. */
-	name = isl_realloc_array(ctx, types->name, char *, types->n + n);
-	if (!name)
-		return isl_printer_free(p);
-	types->name = name;
-
-	for (i = 0; i < n; ++i) {
-		struct pet_type *type = prog->scop->pet->types[i];
-		char *copy;
-
-		if (already_printed(types, type))
-			continue;
-
-		/* A NULL entry would be dereferenced by already_printed. */
-		copy = strdup(type->name);
-		if (!copy)
-			return isl_printer_free(p);
-
-		p = isl_printer_start_line(p);
-		p = isl_printer_print_str(p, type->definition);
-		p = isl_printer_print_str(p, ";");
-		p = isl_printer_end_line(p);
-
-		types->name[types->n++] = copy;
-	}
-
-	return p;
-}
--- a/polly/lib/External/ppcg/gpu_print.h
+++ b/polly/lib/External/ppcg/gpu_print.h
@ -1,28 +0,0 @@
-#ifndef GPU_PRINT_H
-#define GPU_PRINT_H
-
-#include "gpu.h"
-
-__isl_give isl_printer *gpu_print_local_declarations(__isl_take isl_printer *p,
-	struct gpu_prog *prog);
-
-__isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p,
-	struct gpu_types *types, struct gpu_prog *prog);
-
-__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
-	__isl_keep isl_ast_node *node);
-
-__isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn,
-	struct gpu_array_info *array);
-__isl_give isl_printer *gpu_array_info_print_declaration_argument(
-	__isl_take isl_printer *p, struct gpu_array_info *array,
-	const char *memory_space);
-__isl_give isl_printer *gpu_array_info_print_call_argument(
-	__isl_take isl_printer *p, struct gpu_array_info *array);
-
-__isl_give isl_printer *ppcg_kernel_print_copy(__isl_take isl_printer *p,
-	struct ppcg_kernel_stmt *stmt);
-__isl_give isl_printer *ppcg_kernel_print_domain(__isl_take isl_printer *p,
-	struct ppcg_kernel_stmt *stmt);
-
-#endif
--- a/polly/lib/External/ppcg/gpu_tree.c
+++ b/polly/lib/External/ppcg/gpu_tree.c
@ -1,640 +0,0 @@
-/*
- * Copyright 2013      Ecole Normale Superieure
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
- */
-
-#include <string.h>
-
-#include <isl/set.h>
-#include <isl/union_set.h>
-#include <isl/space.h>
-
-#include "gpu_tree.h"
-
-/* The functions in this file are used to navigate part of a schedule tree
- * that is mapped to blocks.  Initially, this part consists of a linear
- * branch segment with a mark node with name "kernel" on the outer end
- * and a mark node with name "thread" on the inner end.
- * During the mapping to blocks, branching may be introduced, but only
- * one of the elements in each sequence contains the "thread" mark.
- * The filter of this element (and only this filter) contains
- * domain elements identified by the "core" argument of the functions
- * that move down this tree.
- *
- * Synchronization statements have a name that starts with "sync" and
- * a user pointer pointing to the kernel that contains the synchronization.
- * The functions inserting or detecting synchronizations take a ppcg_kernel
- * argument to be able to create or identify such statements.
- * They may also use two fields in this structure, the "core" field
- * to move around in the tree and the "n_sync" field to make sure that
- * each synchronization has a different name (within the kernel).
- */
-
-/* Is "node" a mark node with an identifier called "name"?
- *
- * Return 1 if so, 0 if not and -1 if "node" is NULL or
- * if the identifier of the mark cannot be obtained.
- * A mark whose identifier does not have a name never matches.
- */
-static int is_marked(__isl_keep isl_schedule_node *node, const char *name)
-{
-	isl_id *mark;
-	const char *mark_name;
-	int has_name;
-
-	if (!node)
-		return -1;
-
-	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
-		return 0;
-
-	mark = isl_schedule_node_mark_get_id(node);
-	if (!mark)
-		return -1;
-
-	/* isl_id_get_name returns NULL for an identifier without a name;
-	 * do not hand that to strcmp.
-	 */
-	mark_name = isl_id_get_name(mark);
-	has_name = mark_name && !strcmp(mark_name, name);
-	isl_id_free(mark);
-
-	return has_name;
-}
-
-/* Is "node" a mark node with an identifier called "kernel"?
- * Return 1 if so, 0 if not and -1 if is_marked reports an error.
- */
-int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node)
-{
-	return is_marked(node, "kernel");
-}
-
-/* Is "node" a mark node with an identifier called "shared"?
- * Return 1 if so, 0 if not and -1 if is_marked reports an error.
- */
-static int node_is_shared(__isl_keep isl_schedule_node *node)
-{
-	return is_marked(node, "shared");
-}
-
-/* Is "node" a mark node with an identifier called "thread"?
- * Return 1 if so, 0 if not and -1 if is_marked reports an error.
- */
-static int node_is_thread(__isl_keep isl_schedule_node *node)
-{
-	return is_marked(node, "thread");
-}
-
-/* Put a mark node with a freshly allocated identifier called "shared"
- * (and no user pointer) in front of "node".
- */
-static __isl_give isl_schedule_node *insert_shared(
-	__isl_take isl_schedule_node *node)
-{
-	isl_id *shared_id;
-
-	shared_id = isl_id_alloc(isl_schedule_node_get_ctx(node),
-				"shared", NULL);
-
-	return isl_schedule_node_insert_mark(node, shared_id);
-}
-
-/* Walk down from "node" to the "thread" mark and insert a "shared" mark
- * right in front of it, unless a "shared" mark was already encountered
- * on the way down.  The returned node is at the same tree depth
- * as the original "node".
- *
- * The walk doubles as a sanity check: an error is reported if a node
- * without children is reached before any "thread" mark is found or
- * if a node with more than one child is encountered.
- */
-__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
-	__isl_take isl_schedule_node *node)
-{
-	int start_depth, end_depth;
-	int seen_shared = 0;
-
-	if (!node)
-		return NULL;
-
-	start_depth = isl_schedule_node_get_tree_depth(node);
-
-	while (1) {
-		int at_thread;
-		int n_child;
-
-		if (!seen_shared) {
-			seen_shared = node_is_shared(node);
-			if (seen_shared < 0)
-				return isl_schedule_node_free(node);
-		}
-
-		at_thread = node_is_thread(node);
-		if (at_thread < 0)
-			return isl_schedule_node_free(node);
-		if (at_thread)
-			break;
-
-		n_child = isl_schedule_node_n_children(node);
-		if (n_child == 0 || n_child > 1)
-			isl_die(isl_schedule_node_get_ctx(node),
-				isl_error_invalid,
-				n_child == 0 ? "no thread marker found" :
-					"expecting single thread marker",
-				return isl_schedule_node_free(node));
-
-		node = isl_schedule_node_child(node, 0);
-	}
-
-	if (!seen_shared)
-		node = insert_shared(node);
-
-	end_depth = isl_schedule_node_get_tree_depth(node);
-	return isl_schedule_node_ancestor(node, end_depth - start_depth);
-}
-
-/* Given a filter node "node", check whether its filter shares
- * any element with "core", i.e., whether this is the branch
- * that contains the "thread" mark.
- * Return 1 if so, 0 if not and -1 on error.
- */
-static int node_is_core(__isl_keep isl_schedule_node *node,
-	__isl_keep isl_union_set *core)
-{
-	isl_union_set *filter;
-	int disjoint;
-
-	filter = isl_schedule_node_filter_get_filter(node);
-	disjoint = isl_union_set_is_disjoint(filter, core);
-	isl_union_set_free(filter);
-
-	return disjoint < 0 ? -1 : !disjoint;
-}
-
-/* Descend one level from "node" towards the "thread" mark,
- * with the branch holding this mark identified by the domain
- * elements in "core".
- *
- * Anything other than a sequence is left through its first child.
- * For a sequence, the filter children are inspected in order and
- * the child of the first filter that intersects "core" is returned.
- * It is an error for no such filter to exist.
- */
-static __isl_give isl_schedule_node *core_child(
-	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
-{
-	int pos, n_child;
-
-	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
-		return isl_schedule_node_child(node, 0);
-
-	n_child = isl_schedule_node_n_children(node);
-	for (pos = 0; pos < n_child; ++pos) {
-		int in_core;
-
-		node = isl_schedule_node_child(node, pos);
-		in_core = node_is_core(node, core);
-		if (in_core < 0)
-			return isl_schedule_node_free(node);
-		if (!in_core) {
-			node = isl_schedule_node_parent(node);
-			continue;
-		}
-
-		return isl_schedule_node_child(node, 0);
-	}
-
-	isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
-		"core child not found", return isl_schedule_node_free(node));
-}
-
-/* Descend along the branch identified by "core" until
- * the "shared" mark is reached and return a pointer to that mark.
- * Return NULL if the test for the mark fails.
- */
-__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
-	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
-{
-	for (;;) {
-		int at_shared = node_is_shared(node);
-
-		if (at_shared < 0)
-			return isl_schedule_node_free(node);
-		if (at_shared)
-			return node;
-		node = core_child(node, core);
-	}
-}
-
-/* Descend along the branch identified by "core" until
- * the "thread" mark is reached and return a pointer to that mark.
- * Return NULL if the test for the mark fails.
- */
-__isl_give isl_schedule_node *gpu_tree_move_down_to_thread(
-	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
-{
-	for (;;) {
-		int at_thread = node_is_thread(node);
-
-		if (at_thread < 0)
-			return isl_schedule_node_free(node);
-		if (at_thread)
-			return node;
-		node = core_child(node, core);
-	}
-}
-
-/* Climb from a position underneath the "thread" mark, one parent
- * at a time, until the "thread" mark itself is reached.
- * Return NULL if the test for the mark fails.
- */
-__isl_give isl_schedule_node *gpu_tree_move_up_to_thread(
-	__isl_take isl_schedule_node *node)
-{
-	for (;;) {
-		int at_thread = node_is_thread(node);
-
-		if (at_thread < 0)
-			return isl_schedule_node_free(node);
-		if (at_thread)
-			return node;
-		node = isl_schedule_node_parent(node);
-	}
-}
-
-/* Climb from a position underneath the "kernel" mark, one parent
- * at a time, until the "kernel" mark itself is reached.
- * Return NULL if the test for the mark fails.
- */
-__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel(
-	__isl_take isl_schedule_node *node)
-{
-	for (;;) {
-		int at_kernel = gpu_tree_node_is_kernel(node);
-
-		if (at_kernel < 0)
-			return isl_schedule_node_free(node);
-		if (at_kernel)
-			return node;
-		node = isl_schedule_node_parent(node);
-	}
-}
-
-/* Move down from the "kernel" mark (or at least a node with schedule
- * depth smaller than or equal to "depth") to a band node at schedule
- * depth "depth".  The "thread" mark is assumed to have a schedule
- * depth greater than or equal to "depth".  The branch containing the
- * "thread" mark is identified by the domain elements in "core".
- *
- * If the desired schedule depth is in the middle of a band node,
- * then the band node is split into two pieces, the second piece
- * at the desired schedule depth.
- *
- * Once the desired schedule depth has been reached, the descent
- * continues until a "shared" mark, a "thread" mark or a band node
- * is found, whichever comes first.
- * "is_thread" is initialized to zero because the test for the "thread"
- * mark is skipped whenever the test for the "shared" mark is non-zero.
- * Return NULL if either of these tests fails.
- */
-__isl_give isl_schedule_node *gpu_tree_move_down_to_depth(
-	__isl_take isl_schedule_node *node, int depth,
-	__isl_keep isl_union_set *core)
-{
-	int is_shared;
-	int is_thread = 0;
-
-	while (node && isl_schedule_node_get_schedule_depth(node) < depth) {
-		if (isl_schedule_node_get_type(node) ==
-						    isl_schedule_node_band) {
-			int node_depth, node_dim;
-			node_depth = isl_schedule_node_get_schedule_depth(node);
-			node_dim = isl_schedule_node_band_n_member(node);
-			if (node_depth + node_dim > depth)
-				node = isl_schedule_node_band_split(node,
-							depth - node_depth);
-		}
-		node = core_child(node, core);
-	}
-	while ((is_shared = node_is_shared(node)) == 0 &&
-	    (is_thread = node_is_thread(node)) == 0 &&
-	    isl_schedule_node_get_type(node) != isl_schedule_node_band)
-		node = core_child(node, core);
-	if (is_shared < 0 || is_thread < 0)
-		node = isl_schedule_node_free(node);
-
-	return node;
-}
-
-/* Construct the domain of a new synchronization statement for "kernel":
- * a union set holding the universe of a zero-dimensional set space
- * whose tuple identifier is named "syncX", with X the current value
- * of kernel->n_sync (which is incremented), and whose user pointer
- * is "kernel".
- */
-static __isl_give isl_union_set *create_sync_domain(struct ppcg_kernel *kernel)
-{
-	char name[40];
-	isl_id *sync_id;
-	isl_space *sync_space;
-	isl_set *universe;
-
-	sync_space = isl_space_set_alloc(kernel->ctx, 0, 0);
-	snprintf(name, sizeof(name), "sync%d", kernel->n_sync++);
-	sync_id = isl_id_alloc(kernel->ctx, name, kernel);
-	sync_space = isl_space_set_tuple_id(sync_space, isl_dim_set, sync_id);
-	universe = isl_set_universe(sync_space);
-
-	return isl_union_set_from_set(universe);
-}
-
-/* Does "id" identify a synchronization statement of "kernel"?
- * This is the case if its name starts with "sync" and
- * its user pointer is equal to "kernel".
- * An identifier without a name is never a synchronization.
- */
-int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel)
-{
-	const char *name = isl_id_get_name(id);
-
-	if (!name || strncmp(name, "sync", 4) != 0)
-		return 0;
-
-	return isl_id_get_user(id) == kernel;
-}
-
-/* Is "domain" made up of exactly one set and is the tuple identifier
- * of that set a synchronization identifier of "kernel"?
- */
-static int domain_is_sync(__isl_keep isl_union_set *domain,
-	struct ppcg_kernel *kernel)
-{
-	isl_set *set;
-	isl_id *tuple_id;
-	int is_sync;
-
-	if (isl_union_set_n_set(domain) != 1)
-		return 0;
-
-	set = isl_set_from_union_set(isl_union_set_copy(domain));
-	tuple_id = isl_set_get_tuple_id(set);
-	/* "tuple_id" is a reference of its own; "set" is no longer needed. */
-	isl_set_free(set);
-	is_sync = gpu_tree_id_is_sync(tuple_id, kernel);
-	isl_id_free(tuple_id);
-
-	return is_sync;
-}
-
-/* Is "node" a filter node whose filter selects a synchronization
- * statement of "kernel"?
- * Return -1 if "node" is NULL and 0 if it is not a filter node.
- */
-static int node_is_sync_filter(__isl_keep isl_schedule_node *node,
-	struct ppcg_kernel *kernel)
-{
-	isl_union_set *filter;
-	int result;
-
-	if (!node)
-		return -1;
-	if (isl_schedule_node_get_type(node) != isl_schedule_node_filter)
-		return 0;
-
-	filter = isl_schedule_node_filter_get_filter(node);
-	result = domain_is_sync(filter, kernel);
-	isl_union_set_free(filter);
-
-	return result;
-}
-
-/* Does the sequence that "node" is part of contain a synchronization
- * statement for "kernel" before "node"?
- * That is, starting from the parent of "node", is there a previous
- * sibling that is a filter picking out exactly such a statement?
- * Return 1 if so, 0 if not and -1 if the tree could not be navigated.
- */
-static int has_preceding_sync(__isl_keep isl_schedule_node *node,
-	struct ppcg_kernel *kernel)
-{
-	isl_schedule_node *pos;
-	int found = 0;
-
-	pos = isl_schedule_node_parent(isl_schedule_node_copy(node));
-	while (!found) {
-		if (!isl_schedule_node_has_previous_sibling(pos))
-			break;
-		pos = isl_schedule_node_previous_sibling(pos);
-		if (!pos)
-			break;
-		found = node_is_sync_filter(pos, kernel);
-	}
-	if (!pos)
-		found = -1;
-	isl_schedule_node_free(pos);
-
-	return found;
-}
-
-/* Does the sequence that "node" is part of contain a synchronization
- * statement for "kernel" after "node"?
- * That is, starting from the parent of "node", is there a next
- * sibling that is a filter picking out exactly such a statement?
- * Return 1 if so, 0 if not and -1 if the tree could not be navigated.
- */
-static int has_following_sync(__isl_keep isl_schedule_node *node,
-	struct ppcg_kernel *kernel)
-{
-	isl_schedule_node *pos;
-	int found = 0;
-
-	pos = isl_schedule_node_parent(isl_schedule_node_copy(node));
-	while (!found) {
-		if (!isl_schedule_node_has_next_sibling(pos))
-			break;
-		pos = isl_schedule_node_next_sibling(pos);
-		if (!pos)
-			break;
-		found = node_is_sync_filter(pos, kernel);
-	}
-	if (!pos)
-		found = -1;
-	isl_schedule_node_free(pos);
-
-	return found;
-}
-
-/* Is there, anywhere on the way down from the band node "node"
- * to the "thread" mark, a synchronization statement for "kernel"
- * that precedes the core computation of "kernel"
- * (identified by the elements in kernel->core)?
- * Return 1 if so, 0 if not and -1 on error.
- */
-static int has_sync_before_core(__isl_keep isl_schedule_node *node,
-	struct ppcg_kernel *kernel)
-{
-	isl_schedule_node *pos;
-	int at_thread;
-	int has_sync = 0;
-
-	pos = isl_schedule_node_copy(node);
-	for (;;) {
-		at_thread = node_is_thread(pos);
-		if (at_thread != 0)
-			break;
-		pos = core_child(pos, kernel->core);
-		has_sync = has_preceding_sync(pos, kernel);
-		if (has_sync != 0)
-			break;
-	}
-	if (at_thread < 0 || !pos)
-		has_sync = -1;
-	isl_schedule_node_free(pos);
-
-	return has_sync;
-}
-
-/* Is there, anywhere on the way down from the band node "node"
- * to the "thread" mark, a synchronization statement for "kernel"
- * that follows the core computation of "kernel"
- * (identified by the elements in kernel->core)?
- * Return 1 if so, 0 if not and -1 on error.
- */
-static int has_sync_after_core(__isl_keep isl_schedule_node *node,
-	struct ppcg_kernel *kernel)
-{
-	isl_schedule_node *pos;
-	int at_thread;
-	int has_sync = 0;
-
-	pos = isl_schedule_node_copy(node);
-	for (;;) {
-		at_thread = node_is_thread(pos);
-		if (at_thread != 0)
-			break;
-		pos = core_child(pos, kernel->core);
-		has_sync = has_following_sync(pos, kernel);
-		if (has_sync != 0)
-			break;
-	}
-	if (at_thread < 0 || !pos)
-		has_sync = -1;
-	isl_schedule_node_free(pos);
-
-	return has_sync;
-}
-
-/* Graft a new synchronization statement for "kernel" right
- * before "node", inserting (or extending) an extension on top of "node".
- * Return a pointer to the original node in the updated schedule tree.
- */
-static __isl_give isl_schedule_node *insert_sync_before(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
-{
-	isl_schedule_node *graft;
-
-	if (!node)
-		return NULL;
-
-	graft = isl_schedule_node_from_domain(create_sync_domain(kernel));
-
-	return isl_schedule_node_graft_before(node, graft);
-}
-
-/* Graft a new synchronization statement for "kernel" right
- * after "node", inserting (or extending) an extension on top of "node".
- * Return a pointer to the original node in the updated schedule tree.
- */
-static __isl_give isl_schedule_node *insert_sync_after(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
-{
-	isl_schedule_node *graft;
-
-	if (!node)
-		return NULL;
-
-	graft = isl_schedule_node_from_domain(create_sync_domain(kernel));
-
-	return isl_schedule_node_graft_after(node, graft);
-}
-
-/* Make sure "node" is preceded by a synchronization node for "kernel".
- * If there is no such node yet, then one is inserted through
- * an extension on top of "node".
- */
-__isl_give isl_schedule_node *gpu_tree_ensure_preceding_sync(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
-{
-	int has_sync = has_preceding_sync(node, kernel);
-
-	if (has_sync < 0)
-		return isl_schedule_node_free(node);
-
-	return has_sync ? node : insert_sync_before(node, kernel);
-}
-
-/* Make sure "node" is followed by a synchronization node for "kernel".
- * If there is no such node yet, then one is inserted through
- * an extension on top of "node".
- */
-__isl_give isl_schedule_node *gpu_tree_ensure_following_sync(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
-{
-	int has_sync = has_following_sync(node, kernel);
-
-	if (has_sync < 0)
-		return isl_schedule_node_free(node);
-
-	return has_sync ? node : insert_sync_after(node, kernel);
-}
-
-/* Make sure the core computation of "kernel" is followed by
- * a synchronization node.  Nothing is inserted if "node" itself
- * already contains a synchronization node following the core computation
- * or if "node" is already followed by a synchronization node.
- * Otherwise, a synchronization node is inserted after "node"
- * through an extension on top of "node".
- */
-__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
-{
-	int has_sync;
-
-	has_sync = has_sync_after_core(node, kernel);
-	if (has_sync == 0)
-		has_sync = has_following_sync(node, kernel);
-
-	if (has_sync < 0)
-		return isl_schedule_node_free(node);
-	if (has_sync)
-		return node;
-
-	return insert_sync_after(node, kernel);
-}
-
-/* Move to a synchronization node for "kernel" that precedes "node"
- * in the sequence on top of "node".
- *
- * If "node" itself already contains a synchronization node preceding
- * the core computation of "kernel", then "node" is returned unchanged.
- * Otherwise, a preceding synchronization node is created if needed and
- * the previous siblings of the parent of "node" are visited until
- * the filter of the synchronization is found.  The child of this filter
- * is returned.
- */
-__isl_give isl_schedule_node *gpu_tree_move_left_to_sync(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
-{
-	int has_sync = has_sync_before_core(node, kernel);
-
-	if (has_sync < 0)
-		return isl_schedule_node_free(node);
-	if (has_sync)
-		return node;
-
-	node = gpu_tree_ensure_preceding_sync(node, kernel);
-	node = isl_schedule_node_parent(node);
-	for (;;) {
-		int is_sync = node_is_sync_filter(node, kernel);
-
-		if (is_sync < 0) {
-			node = isl_schedule_node_free(node);
-			break;
-		}
-		if (is_sync)
-			break;
-		node = isl_schedule_node_previous_sibling(node);
-	}
-
-	return isl_schedule_node_child(node, 0);
-}
-
-/* Move to a synchronization node for "kernel" that follows "node"
- * in the sequence on top of "node".
- *
- * If "node" itself already contains a synchronization node following
- * the core computation of "kernel", then "node" is returned unchanged.
- * Otherwise, a following synchronization node is created if needed and
- * the next siblings of the parent of "node" are visited until
- * the filter of the synchronization is found.  The child of this filter
- * is returned.
- */
-__isl_give isl_schedule_node *gpu_tree_move_right_to_sync(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel)
-{
-	int has_sync = has_sync_after_core(node, kernel);
-
-	if (has_sync < 0)
-		return isl_schedule_node_free(node);
-	if (has_sync)
-		return node;
-
-	node = gpu_tree_ensure_following_sync(node, kernel);
-	node = isl_schedule_node_parent(node);
-	for (;;) {
-		int is_sync = node_is_sync_filter(node, kernel);
-
-		if (is_sync < 0) {
-			node = isl_schedule_node_free(node);
-			break;
-		}
-		if (is_sync)
-			break;
-		node = isl_schedule_node_next_sibling(node);
-	}
-
-	return isl_schedule_node_child(node, 0);
-}
--- a/polly/lib/External/ppcg/gpu_tree.h
+++ b/polly/lib/External/ppcg/gpu_tree.h
@ -1,33 +0,0 @@
-#ifndef GPU_TREE_H
-#define GPU_TREE_H
-
-#include <isl/schedule_node.h>
-
-#include "gpu.h"
-
-__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
-	__isl_take isl_schedule_node *node);
-int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node);
-__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
-	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
-__isl_give isl_schedule_node *gpu_tree_move_up_to_thread(
-	__isl_take isl_schedule_node *node);
-__isl_give isl_schedule_node *gpu_tree_move_down_to_thread(
-	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
-__isl_give isl_schedule_node *gpu_tree_move_up_to_kernel(
-	__isl_take isl_schedule_node *node);
-__isl_give isl_schedule_node *gpu_tree_move_down_to_depth(
-	__isl_take isl_schedule_node *node, int depth,
-	__isl_keep isl_union_set *core);
-
-int gpu_tree_id_is_sync(__isl_keep isl_id *id, struct ppcg_kernel *kernel);
-__isl_give isl_schedule_node *gpu_tree_ensure_sync_after_core(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
-__isl_give isl_schedule_node *gpu_tree_ensure_following_sync(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
-__isl_give isl_schedule_node *gpu_tree_move_left_to_sync(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
-__isl_give isl_schedule_node *gpu_tree_move_right_to_sync(
-	__isl_take isl_schedule_node *node, struct ppcg_kernel *kernel);
-
-#endif
--- a/polly/lib/External/ppcg/grouping.c
+++ b/polly/lib/External/ppcg/grouping.c
@ -1,684 +0,0 @@
-/*
- * Copyright 2016      Sven Verdoolaege
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege.
- */
-
-#include <isl/ctx.h>
-#include <isl/id.h>
-#include <isl/val.h>
-#include <isl/space.h>
-#include <isl/aff.h>
-#include <isl/set.h>
-#include <isl/map.h>
-#include <isl/union_set.h>
-#include <isl/union_map.h>
-#include <isl/schedule.h>
-#include <isl/schedule_node.h>
-
-#include "ppcg.h"
-
-/* Internal data structure for use during the detection of statements
- * that can be grouped.
- *
- * "sc" contains the original schedule constraints (not a copy).
- * "dep" contains the intersection of the validity and the proximity
- * constraints in "sc".  It may be NULL if it has not been computed yet.
- * "group_id" is the identifier for the next group that is extracted.
- *
- * "domain" is the set of statement instances that belong to any of the groups.
- * "contraction" maps the elements of "domain" to the corresponding group
- * instances.
- * "schedule" schedules the statements in each group relatively to each other.
- * These last three fields are NULL if no groups have been found so far.
- */
-struct ppcg_grouping {
-	isl_schedule_constraints *sc;
-
-	isl_union_map *dep;
-	int group_id;
-
-	isl_union_set *domain;
-	isl_union_pw_multi_aff *contraction;
-	isl_schedule *schedule;
-};
-
-/* Clear all memory allocated by "grouping".
- */
-static void ppcg_grouping_clear(struct ppcg_grouping *grouping)
-{
-	isl_union_map_free(grouping->dep);
-	isl_union_set_free(grouping->domain);
-	isl_union_pw_multi_aff_free(grouping->contraction);
-	isl_schedule_free(grouping->schedule);
-}
-
-/* Compute the intersection of the proximity and validity dependences
- * in grouping->sc and store the result in grouping->dep, unless
- * this intersection has been computed before.
- */
-static isl_stat ppcg_grouping_compute_dep(struct ppcg_grouping *grouping)
-{
-	isl_union_map *validity, *proximity;
-
-	if (grouping->dep)
-		return isl_stat_ok;
-
-	validity = isl_schedule_constraints_get_validity(grouping->sc);
-	proximity = isl_schedule_constraints_get_proximity(grouping->sc);
-	grouping->dep = isl_union_map_intersect(validity, proximity);
-
-	if (!grouping->dep)
-		return isl_stat_error;
-
-	return isl_stat_ok;
-}
-
-/* Information extracted from one or more consecutive leaves
- * in the input schedule.
- *
- * "list" contains the sets of statement instances in the leaves,
- * one element in the list for each original leaf.
- * "domain" contains the union of the sets in "list".
- * "prefix" contains the prefix schedule of these elements.
- */
-struct ppcg_grouping_leaf {
-	isl_union_set *domain;
-	isl_union_set_list *list;
-	isl_multi_union_pw_aff *prefix;
-};
-
-/* Free all memory allocated for "leaves".
- */
-static void ppcg_grouping_leaf_free(int n, struct ppcg_grouping_leaf leaves[])
-{
-	int i;
-
-	if (!leaves)
-		return;
-
-	for (i = 0; i < n; ++i) {
-		isl_union_set_free(leaves[i].domain);
-		isl_union_set_list_free(leaves[i].list);
-		isl_multi_union_pw_aff_free(leaves[i].prefix);
-	}
-
-	free(leaves);
-}
-
-/* Short-hand for retrieving the prefix schedule at "node"
- * in the form of an isl_multi_union_pw_aff.
- */
-static __isl_give isl_multi_union_pw_aff *get_prefix(
-	__isl_keep isl_schedule_node *node)
-{
-	return isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
-}
-
-/* Return an array of "n" elements with information extracted from
- * the "n" children of "node" starting at "first", all of which
- * are known to be filtered leaves.
- */
-struct ppcg_grouping_leaf *extract_leaves(__isl_keep isl_schedule_node *node,
-	int first, int n)
-{
-	int i;
-	isl_ctx *ctx;
-	struct ppcg_grouping_leaf *leaves;
-
-	if (!node)
-		return NULL;
-
-	ctx = isl_schedule_node_get_ctx(node);
-	leaves = isl_calloc_array(ctx, struct ppcg_grouping_leaf, n);
-	if (!leaves)
-		return NULL;
-
-	for (i = 0; i < n; ++i) {
-		isl_schedule_node *child;
-		isl_union_set *domain;
-
-		child = isl_schedule_node_get_child(node, first + i);
-		child = isl_schedule_node_child(child, 0);
-		domain = isl_schedule_node_get_domain(child);
-		leaves[i].domain = isl_union_set_copy(domain);
-		leaves[i].list = isl_union_set_list_from_union_set(domain);
-		leaves[i].prefix = get_prefix(child);
-		isl_schedule_node_free(child);
-	}
-
-	return leaves;
-}
-
-/* Internal data structure used by merge_leaves.
- *
- * "src" and "dst" point to the two consecutive leaves that are
- * under investigation for being merged.
- * "merge" is initially set to 0 and is set to 1 as soon as
- * it turns out that it is useful to merge the two leaves.
- */
-struct ppcg_merge_leaves_data {
-	int merge;
-	struct ppcg_grouping_leaf *src;
-	struct ppcg_grouping_leaf *dst;
-};
-
-/* Given a relation "map" between instances of two statements A and B,
- * does it relate every instance of A (according to the domain of "src")
- * to every instance of B (according to the domain of "dst")?
- */
-static isl_bool covers_src_and_dst(__isl_keep isl_map *map,
-	struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
-{
-	isl_space *space;
-	isl_set *set1, *set2;
-	isl_bool is_subset;
-
-	space = isl_space_domain(isl_map_get_space(map));
-	set1 = isl_union_set_extract_set(src->domain, space);
-	set2 = isl_map_domain(isl_map_copy(map));
-	is_subset = isl_set_is_subset(set1, set2);
-	isl_set_free(set1);
-	isl_set_free(set2);
-	if (is_subset < 0 || !is_subset)
-		return is_subset;
-
-	space = isl_space_range(isl_map_get_space(map));
-	set1 = isl_union_set_extract_set(dst->domain, space);
-	set2 = isl_map_range(isl_map_copy(map));
-	is_subset = isl_set_is_subset(set1, set2);
-	isl_set_free(set1);
-	isl_set_free(set2);
-
-	return is_subset;
-}
-
-/* Given a relation "map" between instances of two statements A and B,
- * are pairs of related instances executed together in the input schedule?
- * That is, is each pair of instances assigned the same value
- * by the corresponding prefix schedules?
- *
- * In particular, select the subset of "map" that has pairs of elements
- * with the same value for the prefix schedules and then check
- * if "map" is still a subset of the result.
- */
-static isl_bool matches_prefix(__isl_keep isl_map *map,
-	struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
-{
-	isl_union_map *umap, *equal;
-	isl_multi_union_pw_aff *src_prefix, *dst_prefix, *prefix;
-	isl_bool is_subset;
-
-	src_prefix = isl_multi_union_pw_aff_copy(src->prefix);
-	dst_prefix = isl_multi_union_pw_aff_copy(dst->prefix);
-	prefix = isl_multi_union_pw_aff_union_add(src_prefix, dst_prefix);
-
-	umap = isl_union_map_from_map(isl_map_copy(map));
-	equal = isl_union_map_copy(umap);
-	equal = isl_union_map_eq_at_multi_union_pw_aff(equal, prefix);
-
-	is_subset = isl_union_map_is_subset(umap, equal);
-
-	isl_union_map_free(umap);
-	isl_union_map_free(equal);
-
-	return is_subset;
-}
-
-/* Given a set of validity and proximity schedule constraints "map"
- * between statements in consecutive leaves in a valid schedule,
- * should the two leaves be merged into one?
- *
- * In particular, the two are merged if the constraints form
- * a bijection between every instance of the first statement and
- * every instance of the second statement.  Moreover, each
- * pair of such dependent instances needs to be executed consecutively
- * in the input schedule.  That is, they need to be assigned
- * the same value by their prefix schedules.
- *
- * What this means is that for each instance of the first statement
- * there is exactly one instance of the second statement that
- * is executed immediately after the instance of the first statement and
- * that, moreover, both depends on this statement instance and
- * should be brought as close as possible to this statement instance.
- * In other words, it is both possible to execute the two instances
- * together (according to the input schedule) and desirable to do so
- * (according to the validity and proximity schedule constraints).
- */
-static isl_stat check_merge(__isl_take isl_map *map, void *user)
-{
-	struct ppcg_merge_leaves_data *data = user;
-	isl_bool ok;
-
-	ok = covers_src_and_dst(map, data->src, data->dst);
-	if (ok >= 0 && ok)
-		ok = isl_map_is_bijective(map);
-	if (ok >= 0 && ok)
-		ok = matches_prefix(map, data->src, data->dst);
-
-	isl_map_free(map);
-
-	if (ok < 0)
-		return isl_stat_error;
-	if (!ok)
-		return isl_stat_ok;
-
-	data->merge = 1;
-	return isl_stat_error;
-}
-
-/* Merge the leaves at position "pos" and "pos + 1" in "leaves".
- */
-static isl_stat merge_pair(int n, struct ppcg_grouping_leaf leaves[], int pos)
-{
-	int i;
-
-	leaves[pos].domain = isl_union_set_union(leaves[pos].domain,
-						leaves[pos + 1].domain);
-	leaves[pos].list = isl_union_set_list_concat(leaves[pos].list,
-						leaves[pos + 1].list);
-	leaves[pos].prefix = isl_multi_union_pw_aff_union_add(
-				leaves[pos].prefix, leaves[pos + 1].prefix);
-	for (i = pos + 1; i + 1 < n; ++i)
-		leaves[i] = leaves[i + 1];
-	leaves[n - 1].domain = NULL;
-	leaves[n - 1].list = NULL;
-	leaves[n - 1].prefix = NULL;
-
-	if (!leaves[pos].domain || !leaves[pos].list || !leaves[pos].prefix)
-		return isl_stat_error;
-
-	return isl_stat_ok;
-}
-
-/* Merge pairs of consecutive leaves in "leaves" taking into account
- * the intersection of validity and proximity schedule constraints "dep".
- *
- * If a leaf has been merged with the next leaf, then the combination
- * is checked again for merging with the next leaf.
- * That is, if the leaves are A, B and C, then B may not have been
- * merged with C, but after merging A and B, it could still be useful
- * to merge the combination AB with C.
- *
- * Two leaves A and B are merged if there are instances of at least
- * one pair of statements, one statement in A and one B, such that
- * the validity and proximity schedule constraints between them
- * make them suitable for merging according to check_merge.
- *
- * Return the final number of leaves in the sequence, or -1 on error.
- */
-static int merge_leaves(int n, struct ppcg_grouping_leaf leaves[],
-	__isl_keep isl_union_map *dep)
-{
-	int i;
-	struct ppcg_merge_leaves_data data;
-
-	for (i = n - 1; i >= 0; --i) {
-		isl_union_map *dep_i;
-		isl_stat ok;
-
-		if (i + 1 >= n)
-			continue;
-
-		dep_i = isl_union_map_copy(dep);
-		dep_i = isl_union_map_intersect_domain(dep_i,
-				isl_union_set_copy(leaves[i].domain));
-		dep_i = isl_union_map_intersect_range(dep_i,
-				isl_union_set_copy(leaves[i + 1].domain));
-		data.merge = 0;
-		data.src = &leaves[i];
-		data.dst = &leaves[i + 1];
-		ok = isl_union_map_foreach_map(dep_i, &check_merge, &data);
-		isl_union_map_free(dep_i);
-		if (ok < 0 && !data.merge)
-			return -1;
-		if (!data.merge)
-			continue;
-		if (merge_pair(n, leaves, i) < 0)
-			return -1;
-		--n;
-		++i;
-	}
-
-	return n;
-}
-
-/* Construct a schedule with "domain" as domain, that executes
- * the elements of "list" in order (as a sequence).
- */
-static __isl_give isl_schedule *schedule_from_domain_and_list(
-	__isl_keep isl_union_set *domain, __isl_keep isl_union_set_list *list)
-{
-	isl_schedule *schedule;
-	isl_schedule_node *node;
-
-	schedule = isl_schedule_from_domain(isl_union_set_copy(domain));
-	node = isl_schedule_get_root(schedule);
-	isl_schedule_free(schedule);
-	node = isl_schedule_node_child(node, 0);
-	list = isl_union_set_list_copy(list);
-	node = isl_schedule_node_insert_sequence(node, list);
-	schedule = isl_schedule_node_get_schedule(node);
-	isl_schedule_node_free(node);
-
-	return schedule;
-}
-
-/* Construct a unique identifier for a group in "grouping".
- *
- * The name is of the form G_n, with n the first value starting at
- * grouping->group_id that does not result in an identifier
- * that is already in use in the domain of the original schedule
- * constraints.
- */
-static isl_id *construct_group_id(struct ppcg_grouping *grouping,
-	__isl_take isl_space *space)
-{
-	isl_ctx *ctx;
-	isl_id *id;
-	isl_bool empty;
-	isl_union_set *domain;
-
-	if (!space)
-		return NULL;
-
-	ctx = isl_space_get_ctx(space);
-	domain = isl_schedule_constraints_get_domain(grouping->sc);
-
-	do {
-		char buffer[20];
-		isl_id *id;
-		isl_set *set;
-
-		snprintf(buffer, sizeof(buffer), "G_%d", grouping->group_id);
-		grouping->group_id++;
-		id = isl_id_alloc(ctx, buffer, NULL);
-		space = isl_space_set_tuple_id(space, isl_dim_set, id);
-		set = isl_union_set_extract_set(domain, isl_space_copy(space));
-		empty = isl_set_plain_is_empty(set);
-		isl_set_free(set);
-	} while (empty >= 0 && !empty);
-
-	if (empty < 0)
-		space = isl_space_free(space);
-
-	id = isl_space_get_tuple_id(space, isl_dim_set);
-
-	isl_space_free(space);
-	isl_union_set_free(domain);
-
-	return id;
-}
-
-/* Construct a contraction from "prefix" and "domain" for a new group
- * in "grouping".
- *
- * The values of the prefix schedule "prefix" are used as instances
- * of the new group.  The identifier of the group is constructed
- * in such a way that it does not conflict with those of earlier
- * groups nor with statements in the domain of the original
- * schedule constraints.
- * The isl_multi_union_pw_aff "prefix" then simply needs to be
- * converted to an isl_union_pw_multi_aff.  However, this is not
- * possible if "prefix" is zero-dimensional, so in this case,
- * a contraction is constructed from "domain" instead.
- */
-static isl_union_pw_multi_aff *group_contraction_from_prefix_and_domain(
-	struct ppcg_grouping *grouping,
-	__isl_keep isl_multi_union_pw_aff *prefix,
-	__isl_keep isl_union_set *domain)
-{
-	isl_id *id;
-	isl_space *space;
-	int dim;
-
-	space = isl_multi_union_pw_aff_get_space(prefix);
-	if (!space)
-		return NULL;
-	dim = isl_space_dim(space, isl_dim_set);
-	id = construct_group_id(grouping, space);
-	if (dim == 0) {
-		isl_multi_val *mv;
-
-		space = isl_multi_union_pw_aff_get_space(prefix);
-		space = isl_space_set_tuple_id(space, isl_dim_set, id);
-		mv = isl_multi_val_zero(space);
-		domain = isl_union_set_copy(domain);
-		return isl_union_pw_multi_aff_multi_val_on_domain(domain, mv);
-	}
-	prefix = isl_multi_union_pw_aff_copy(prefix);
-	prefix = isl_multi_union_pw_aff_set_tuple_id(prefix, isl_dim_out, id);
-	return isl_union_pw_multi_aff_from_multi_union_pw_aff(prefix);
-}
-
-/* Extend "grouping" with groups corresponding to merged
- * leaves in the list of potentially merged leaves "leaves".
- *
- * The "list" field of each element in "leaves" contains a list
- * of the instances sets of the original leaves that have been
- * merged into this element.  If at least two of the original leaves
- * have been merged into a given element, then add the corresponding
- * group to "grouping".
- * In particular, the domain is extended with the statement instances
- * of the merged leaves, the contraction is extended with a mapping
- * of these statement instances to instances of a new group and
- * the schedule is extended with a schedule that executes
- * the statement instances according to the order of the leaves
- * in which they appear.
- * Since the instances of the groups should already be scheduled apart
- * in the schedule into which this schedule will be plugged in,
- * the schedules of the individual groups are combined independently
- * of each other (as a set).
- */
-static isl_stat add_groups(struct ppcg_grouping *grouping,
-	int n, struct ppcg_grouping_leaf leaves[])
-{
-	int i;
-
-	for (i = 0; i < n; ++i) {
-		int n_leaf;
-		isl_schedule *schedule;
-		isl_union_set *domain;
-		isl_union_pw_multi_aff *upma;
-
-		n_leaf = isl_union_set_list_n_union_set(leaves[i].list);
-		if (n_leaf < 0)
-			return isl_stat_error;
-		if (n_leaf <= 1)
-			continue;
-		schedule = schedule_from_domain_and_list(leaves[i].domain,
-							leaves[i].list);
-		upma = group_contraction_from_prefix_and_domain(grouping,
-					leaves[i].prefix, leaves[i].domain);
-
-		domain = isl_union_set_copy(leaves[i].domain);
-		if (grouping->domain) {
-			domain = isl_union_set_union(domain, grouping->domain);
-			upma = isl_union_pw_multi_aff_union_add(upma,
-						grouping->contraction);
-			schedule = isl_schedule_set(schedule,
-						grouping->schedule);
-		}
-		grouping->domain = domain;
-		grouping->contraction = upma;
-		grouping->schedule = schedule;
-
-		if (!grouping->domain || !grouping->contraction ||
-		    !grouping->schedule)
-			return isl_stat_error;
-	}
-
-	return isl_stat_ok;
-}
-
-/* Look for any pairs of consecutive leaves among the "n" children of "node"
- * starting at "first" that should be merged together.
- * Store the results in "grouping".
- *
- * First make sure the intersection of validity and proximity
- * schedule constraints is available and extract the required
- * information from the "n" leaves.
- * Then try and merge consecutive leaves based on the validity
- * and proximity constraints.
- * If any pairs were successfully merged, then add groups
- * corresponding to the merged leaves to "grouping".
- */
-static isl_stat group_subsequence(__isl_keep isl_schedule_node *node,
-	int first, int n, struct ppcg_grouping *grouping)
-{
-	int n_merge;
-	struct ppcg_grouping_leaf *leaves;
-
-	if (ppcg_grouping_compute_dep(grouping) < 0)
-		return isl_stat_error;
-
-	leaves = extract_leaves(node, first, n);
-	if (!leaves)
-		return isl_stat_error;
-
-	n_merge = merge_leaves(n, leaves, grouping->dep);
-	if (n_merge >= 0 && n_merge < n &&
-	    add_groups(grouping, n_merge, leaves) < 0)
-		return isl_stat_error;
-
-	ppcg_grouping_leaf_free(n, leaves);
-
-	return isl_stat_ok;
-}
-
-/* If "node" is a sequence, then check if it has any consecutive
- * leaves that should be merged together and store the results
- * in "grouping".
- *
- * In particular, call group_subsequence on each consecutive
- * sequence of (filtered) leaves among the children of "node".
- */
-static isl_bool detect_groups(__isl_keep isl_schedule_node *node, void *user)
-{
-	int i, n, first;
-	struct ppcg_grouping *grouping = user;
-
-	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
-		return isl_bool_true;
-
-	n = isl_schedule_node_n_children(node);
-	if (n < 0)
-		return isl_bool_error;
-
-	first = -1;
-	for (i = 0; i < n; ++i) {
-		isl_schedule_node *child;
-		enum isl_schedule_node_type type;
-
-		child = isl_schedule_node_get_child(node, i);
-		child = isl_schedule_node_child(child, 0);
-		type = isl_schedule_node_get_type(child);
-		isl_schedule_node_free(child);
-
-		if (first >= 0 && type != isl_schedule_node_leaf) {
-			if (group_subsequence(node, first, i - first,
-						grouping) < 0)
-				return isl_bool_error;
-			first = -1;
-		}
-		if (first < 0 && type == isl_schedule_node_leaf)
-			first = i;
-	}
-	if (first >= 0) {
-		if (group_subsequence(node, first, n - first, grouping) < 0)
-			return isl_bool_error;
-	}
-
-	return isl_bool_true;
-}
-
-/* Complete "grouping" to cover all statement instances in the domain
- * of grouping->sc.
- *
- * In particular, grouping->domain is set to the full set of statement
- * instances; group->contraction is extended with an identity
- * contraction on the additional instances and group->schedule
- * is extended with an independent schedule on those additional instances.
- * In the extension of group->contraction, the additional instances
- * are split into those belong to different statements and those
- * that belong to some of the same statements.  The first group
- * is replaced by its universe in order to simplify the contraction extension.
- */
-static void complete_grouping(struct ppcg_grouping *grouping)
-{
-	isl_union_set *domain, *left, *overlap;
-	isl_union_pw_multi_aff *upma;
-	isl_schedule *schedule;
-
-	domain = isl_schedule_constraints_get_domain(grouping->sc);
-	left = isl_union_set_subtract(isl_union_set_copy(domain),
-				    isl_union_set_copy(grouping->domain));
-	schedule = isl_schedule_from_domain(isl_union_set_copy(left));
-	schedule = isl_schedule_set(schedule, grouping->schedule);
-	grouping->schedule = schedule;
-
-	overlap = isl_union_set_universe(grouping->domain);
-	grouping->domain = domain;
-	overlap = isl_union_set_intersect(isl_union_set_copy(left), overlap);
-	left = isl_union_set_subtract(left, isl_union_set_copy(overlap));
-	left = isl_union_set_universe(left);
-	left = isl_union_set_union(left, overlap);
-	upma = isl_union_set_identity_union_pw_multi_aff(left);
-	upma = isl_union_pw_multi_aff_union_add(upma, grouping->contraction);
-	grouping->contraction = upma;
-}
-
-/* Compute a schedule on the domain of "sc" that respects the schedule
- * constraints in "sc".
- *
- * "schedule" is a known correct schedule that is used to combine
- * groups of statements if options->group_chains is set.
- * In particular, statements that are executed consecutively in a sequence
- * in this schedule and where all instances of the second depend on
- * the instance of the first that is executed in the same iteration
- * of outer band nodes are grouped together into a single statement.
- * The schedule constraints are then mapped to these groups of statements
- * and the resulting schedule is expanded again to refer to the original
- * statements.
- */
-__isl_give isl_schedule *ppcg_compute_schedule(
-	__isl_take isl_schedule_constraints *sc,
-	__isl_keep isl_schedule *schedule, struct ppcg_options *options)
-{
-	struct ppcg_grouping grouping = { sc };
-	isl_union_pw_multi_aff *contraction;
-	isl_union_map *umap;
-	isl_schedule *res, *expansion;
-
-	if (!options->group_chains)
-		return isl_schedule_constraints_compute_schedule(sc);
-
-	grouping.group_id = 0;
-	if (isl_schedule_foreach_schedule_node_top_down(schedule,
-			&detect_groups, &grouping) < 0)
-		goto error;
-	if (!grouping.contraction) {
-		ppcg_grouping_clear(&grouping);
-		return isl_schedule_constraints_compute_schedule(sc);
-	}
-	complete_grouping(&grouping);
-	contraction = isl_union_pw_multi_aff_copy(grouping.contraction);
-	umap = isl_union_map_from_union_pw_multi_aff(contraction);
-
-	sc = isl_schedule_constraints_apply(sc, umap);
-
-	res = isl_schedule_constraints_compute_schedule(sc);
-
-	contraction = isl_union_pw_multi_aff_copy(grouping.contraction);
-	expansion = isl_schedule_copy(grouping.schedule);
-	res = isl_schedule_expand(res, contraction, expansion);
-
-	ppcg_grouping_clear(&grouping);
-	return res;
-error:
-	ppcg_grouping_clear(&grouping);
-	isl_schedule_constraints_free(sc);
-	return NULL;
-}
--- a/polly/lib/External/ppcg/hybrid.c
+++ b/polly/lib/External/ppcg/hybrid.c
--- a/polly/lib/External/ppcg/hybrid.h
+++ b/polly/lib/External/ppcg/hybrid.h
@ -1,41 +0,0 @@
-#ifndef HYBRID_H
-#define HYBRID_H
-
-#include <isl/val.h>
-#include <isl/schedule_node.h>
-
-#include "ppcg.h"
-
-struct ppcg_ht_bounds;
-typedef struct ppcg_ht_bounds ppcg_ht_bounds;
-
-struct ppcg_ht_phase;
-typedef struct ppcg_ht_phase ppcg_ht_phase;
-
-isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node);
-isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node);
-
-__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop,
-	__isl_keep isl_schedule_node *node);
-void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds);
-isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds);
-isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds,
-	__isl_keep isl_multi_val *sizes);
-__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling(
-	__isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes,
-	__isl_take isl_schedule_node *node, struct ppcg_options *options);
-__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free(
-	__isl_take ppcg_ht_bounds *bounds);
-
-__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark(
-	__isl_keep isl_schedule_node *node);
-__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point(
-	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node);
-__isl_give isl_schedule_node *hybrid_tile_foreach_phase(
-	__isl_take isl_schedule_node *node,
-	__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
-		void *user), void *user);
-__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks(
-	__isl_take isl_schedule_node *node);
-
-#endif
--- a/polly/lib/External/ppcg/ocl_utilities.c
+++ b/polly/lib/External/ppcg/ocl_utilities.c
@ -1,174 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "ocl_utilities.h"
-
-/* Return the OpenCL error string for a given error number.
- */
-const char *opencl_error_string(cl_int error)
-{
-	int errorCount;
-	int index;
-
-	static const char *errorString[] = {
-		[CL_SUCCESS] = "CL_SUCCESS",
-		[-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND",
-		[-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE",
-		[-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE",
-		[-CL_MEM_OBJECT_ALLOCATION_FAILURE] =
-			"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		[-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES",
-		[-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY",
-		[-CL_PROFILING_INFO_NOT_AVAILABLE] =
-			"CL_PROFILING_INFO_NOT_AVAILABLE",
-		[-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP",
-		[-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH",
-		[-CL_IMAGE_FORMAT_NOT_SUPPORTED] =
-			"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		[-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE",
-		[-CL_MAP_FAILURE] = "CL_MAP_FAILURE",
-		[-CL_INVALID_VALUE] = "CL_INVALID_VALUE",
-		[-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE",
-		[-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM",
-		[-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE",
-		[-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT",
-		[-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES",
-		[-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE",
-		[-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR",
-		[-CL_INVALID_MEM_OBJECT] = "CL_INVALID_MEM_OBJECT",
-		[-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] =
-			"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		[-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE",
-		[-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER",
-		[-CL_INVALID_BINARY] = "CL_INVALID_BINARY",
-		[-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS",
-		[-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM",
-		[-CL_INVALID_PROGRAM_EXECUTABLE] =
-			"CL_INVALID_PROGRAM_EXECUTABLE",
-		[-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME",
-		[-CL_INVALID_KERNEL_DEFINITION] =
-			"CL_INVALID_KERNEL_DEFINITION",
-		[-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL",
-		[-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX",
-		[-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE",
-		[-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE",
-		[-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS",
-		[-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION",
-		[-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE",
-		[-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE",
-		[-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET",
-		[-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST",
-		[-CL_INVALID_EVENT] = "CL_INVALID_EVENT",
-		[-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION",
-		[-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT",
-		[-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE",
-		[-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL",
-		[-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE",
-		[-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY"
-	};
-
-	errorCount = sizeof(errorString) / sizeof(errorString[0]);
-	index = -error;
-
-	return (index >= 0 && index < errorCount) ?
-		errorString[index] : "Unspecified Error";
-}
-
-/* Find a GPU or a CPU associated with the first available platform.
- * If use_gpu is set, then this function first tries to look for a GPU
- * in the first available platform.
- * If this fails or if use_gpu is not set, then it tries to use the CPU.
- */
-cl_device_id opencl_create_device(int use_gpu)
-{
-	cl_platform_id platform;
-	cl_device_id dev;
-	int err;
-
-	err = clGetPlatformIDs(1, &platform, NULL);
-	if (err < 0) {
-		fprintf(stderr, "Error %s while looking for a platform.\n",
-				opencl_error_string(err));
-		exit(1);
-	}
-
-	err = CL_DEVICE_NOT_FOUND;
-	if (use_gpu)
-		err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev,
-				NULL);
-	if (err == CL_DEVICE_NOT_FOUND)
-		err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev,
-				NULL);
-	if (err < 0) {
-		fprintf(stderr, "Error %s while looking for a device.\n",
-				opencl_error_string(err));
-		exit(1);
-	}
-	return dev;
-}
-
-/* Create an OpenCL program from a string and compile it.
- */
-cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
-	const char *program_source, size_t program_size,
-	const char *opencl_options)
-{
-	int err;
-	cl_program program;
-	char *program_log;
-	size_t log_size;
-
-	program = clCreateProgramWithSource(ctx, 1,
-			&program_source, &program_size, &err);
-	if (err < 0) {
-		fprintf(stderr, "Could not create the program\n");
-		exit(1);
-	}
-	err = clBuildProgram(program, 0, NULL, opencl_options, NULL, NULL);
-	if (err < 0) {
-		fprintf(stderr, "Could not build the program.\n");
-		clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0,
-				NULL, &log_size);
-		program_log = (char *) malloc(log_size + 1);
-		program_log[log_size] = '\0';
-		clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
-				log_size + 1, program_log, NULL);
-		fprintf(stderr, "%s\n", program_log);
-		free(program_log);
-		exit(1);
-	}
-	return program;
-}
-
-/* Create an OpenCL program from a source file and compile it.
- */
-cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
-	const char* filename, const char* opencl_options)
-{
-	cl_program program;
-	FILE *program_file;
-	char *program_source;
-	size_t program_size, read;
-
-	program_file = fopen(filename, "r");
-	if (program_file == NULL) {
-		fprintf(stderr, "Could not find the source file.\n");
-		exit(1);
-	}
-	fseek(program_file, 0, SEEK_END);
-	program_size = ftell(program_file);
-	rewind(program_file);
-	program_source = (char *) malloc(program_size + 1);
-	program_source[program_size] = '\0';
-	read = fread(program_source, sizeof(char), program_size, program_file);
-	if (read != program_size) {
-		fprintf(stderr, "Error while reading the kernel.\n");
-		exit(1);
-	}
-	fclose(program_file);
-
-	program = opencl_build_program_from_string(ctx, dev, program_source,
-						program_size, opencl_options);
-	free(program_source);
-
-	return program;
-}
--- a/polly/lib/External/ppcg/ocl_utilities.h
+++ b/polly/lib/External/ppcg/ocl_utilities.h
@ -1,32 +0,0 @@
-#ifndef OCL_UTILITIES_H
-#define OCL_UTILITIES_H
-
-#if defined(__APPLE__)
-#include <OpenCL/opencl.h>
-#else
-#include <CL/opencl.h>
-#endif
-
-/* Return the OpenCL error string for a given error number.
- */
-const char *opencl_error_string(cl_int error);
-
-/* Find a GPU or a CPU associated with the first available platform.
- * If use_gpu is set, then this function first tries to look for a GPU
- * in the first available platform.
- * If this fails or if use_gpu is not set, then it tries to use the CPU.
- */
-cl_device_id opencl_create_device(int use_gpu);
-
-/* Create an OpenCL program from a string and compile it.
- */
-cl_program opencl_build_program_from_string(cl_context ctx, cl_device_id dev,
-	const char *program_source, size_t program_size,
-	const char *opencl_options);
-
-/* Create an OpenCL program from a source file and compile it.
- */
-cl_program opencl_build_program_from_file(cl_context ctx, cl_device_id dev,
-	const char* filename, const char* opencl_options);
-
-#endif
--- a/polly/lib/External/ppcg/opencl.h
+++ b/polly/lib/External/ppcg/opencl.h
@ -1,11 +0,0 @@
-#ifndef _OPENCL_H
-#define _OPENCL_H
-
-#include <pet.h>
-#include "ppcg_options.h"
-#include "ppcg.h"
-
-int generate_opencl(isl_ctx *ctx, struct ppcg_options *options,
-	const char *input, const char *output);
-
-#endif
--- a/polly/lib/External/ppcg/opencl_test.sh.in
+++ b/polly/lib/External/ppcg/opencl_test.sh.in
@ -1,78 +0,0 @@
-#!/bin/sh
-
-keep=no
-
-for option; do
-	case "$option" in
-		--keep)
-			keep=yes
-			;;
-	esac
-done
-
-EXEEXT=@EXEEXT@
-VERSION=@GIT_HEAD_VERSION@
-CC="@CC@"
-CFLAGS="--std=gnu99"
-srcdir="@srcdir@"
-
-if [ $keep = "yes" ]; then
-	OUTDIR="opencl_test.$VERSION"
-	mkdir "$OUTDIR" || exit 1
-else
-	if test "x$TMPDIR" = "x"; then
-		TMPDIR=/tmp
-	fi
-	OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1
-fi
-
-run_tests () {
-	subdir=$1
-	ppcg_options=$2
-
-	echo Test with PPCG options \'$ppcg_options\'
-	mkdir ${OUTDIR}/${subdir} || exit 1
-	for i in $srcdir/tests/*.c; do
-		echo $i
-		name=`basename $i`
-		name="${name%.c}"
-		out_c="${OUTDIR}/${subdir}/$name.ppcg.c"
-		out="${OUTDIR}/${subdir}/$name.ppcg$EXEEXT"
-		options="--target=opencl --opencl-no-use-gpu $ppcg_options"
-		functions="$srcdir/tests/${name}_opencl_functions.cl"
-		if test -f $functions; then
-			options="$options --opencl-include-file=$functions"
-			options="$options --opencl-compiler-options=-I."
-		fi
-		./ppcg$EXEEXT $options $i -o "$out_c" || exit
-		$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
-			-I. "$out_c" -o "$out" || exit
-		$out || exit
-	done
-}
-
-run_tests default
-run_tests embed --opencl-embed-kernel-code
-
-for i in $srcdir/examples/*.c; do
-	echo $i
-	name=`basename $i`
-	name="${name%.c}"
-	exe_ref="${OUTDIR}/$name.ref$EXEEXT"
-	gen_ocl="${OUTDIR}/$name.ppcg.c"
-	exe_ocl="${OUTDIR}/$name.ppcg$EXEEXT"
-	output_ref="${OUTDIR}/$name.ref.out"
-	output_ocl="${OUTDIR}/$name.ppcg.out"
-	$CC $CFLAGS $i -o $exe_ref || exit
-	./ppcg$EXEEXT --target=opencl --opencl-no-use-gpu $i -o "$gen_ocl" || \
-		exit
-	$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
-		"$gen_ocl" -o "$exe_ocl" || exit
-	$exe_ref > $output_ref || exit
-	$exe_ocl > $output_ocl || exit
-	cmp $output_ref $output_ocl || exit
-done
-
-if [ $keep = "no" ]; then
-	rm -r "${OUTDIR}"
-fi
--- a/polly/lib/External/ppcg/polybench_test.sh.in
+++ b/polly/lib/External/ppcg/polybench_test.sh.in
@ -1,109 +0,0 @@
-#!/bin/sh
-
-keep=no
-verbose=no
-
-for option; do
-	case "$option" in
-		--keep)
-			keep=yes
-			;;
-		--verbose)
-			verbose=yes
-			;;
-	esac
-done
-
-EXEEXT=@EXEEXT@
-DIR=@POLYBENCH_DIR@
-VERSION=@GIT_HEAD_VERSION@
-SIZE=-DMINI_DATASET
-CC="@CC@"
-HAVE_OPENCL=@HAVE_OPENCL@
-HAVE_OPENMP=@HAVE_OPENMP@
-srcdir="@srcdir@"
-if [ $keep = "yes" ]; then
-	OUTDIR="out.$VERSION"
-	mkdir "$OUTDIR" || exit 1
-else
-	if test "x$TMPDIR" = "x"; then
-		TMPDIR=/tmp
-	fi
-	OUTDIR=`mktemp -d $TMPDIR/ppcg.XXXXXXXXXX` || exit 1
-fi
-CPPFLAGS="-DPOLYBENCH_USE_C99_PROTO -DPOLYBENCH_DUMP_ARRAYS"
-CPPFLAGS="$CPPFLAGS $SIZE -I $DIR/utilities"
-CFLAGS="-lm --std=gnu99"
-
-echo "Running tests in folder ${OUTDIR}"
-
-run_tests () {
-	ext=$1
-
-	ppcg_options=$2
-	cc_options=$3
-
-	if [ "x$ppcg_options" = "x" ]; then
-		ppcg_option_str="none"
-	else
-		ppcg_option_str=$ppcg_options
-	fi
-
-	if [ "x$cc_options" = "x" ]; then
-		cc_option_str="none"
-	else
-		cc_option_str=$cc_options
-	fi
-
-	echo Test: $ext, ppcg options: $ppcg_option_str, CC options: $cc_option_str
-	for i in `cat $DIR/utilities/benchmark_list`; do
-		echo $i
-		name=`basename $i`
-		name=${name%.c}
-		source_opt="${OUTDIR}/$name.$ext.c"
-		prog_orig=${OUTDIR}/$name.orig${EXEEXT}
-		prog_opt=${OUTDIR}/$name.$ext${EXEEXT}
-		output_orig=${OUTDIR}/$name.orig.out
-		output_opt=${OUTDIR}/$name.$ext.out
-		dir=`dirname $i`
-		if [ $verbose = "yes" ]; then
-			echo ./ppcg$EXEEXT -I $DIR/$dir $DIR/$i \
-				$CPPFLAGS -o $source_opt $ppcg_options
-		fi
-		./ppcg$EXEEXT -I $DIR/$dir $DIR/$i $CPPFLAGS \
-			-o $source_opt $ppcg_options || exit
-		$CC -I $DIR/$dir $CPPFLAGS $DIR/$i -o $prog_orig \
-			$DIR/utilities/polybench.c $CFLAGS
-		$prog_orig 2> $output_orig
-		if [ $verbose = "yes" ]; then
-			echo $CC -I $DIR/$dir $CPPFLAGS $source_opt \
-				-o $prog_opt $DIR/utilities/polybench.c \
-				$CFLAGS $cc_options
-		fi
-		$CC -I $DIR/$dir $CPPFLAGS $source_opt -o $prog_opt \
-			$DIR/utilities/polybench.c $CFLAGS $cc_options || exit
-
-		$prog_opt 2> $output_opt
-		cmp $output_orig $output_opt || exit
-	done
-}
-
-run_tests ppcg "--target=c --tile"
-run_tests ppcg_live "--target=c --no-live-range-reordering --tile"
-
-# Test OpenMP code, if compiler supports openmp
-if [ $HAVE_OPENMP = "yes" ]; then
-	run_tests ppcg_omp "--target=c --openmp" -fopenmp
-	echo Introduced `grep -R 'omp parallel' "${OUTDIR}" | wc -l` '"pragma omp parallel for"'
-else
-	echo Compiler does not support OpenMP. Skipping OpenMP tests.
-fi
-
-if [ $HAVE_OPENCL = "yes" ]; then
-	run_tests ppcg_opencl "--target=opencl --opencl-no-use-gpu" \
-				"-I $srcdir $srcdir/ocl_utilities.c -lOpenCL"
-fi
-
-if [ $keep = "no" ]; then
-	rm -r "${OUTDIR}"
-fi
--- a/polly/lib/External/ppcg/ppcg.c
+++ b/polly/lib/External/ppcg/ppcg.c
--- a/polly/lib/External/ppcg/ppcg.h
+++ b/polly/lib/External/ppcg/ppcg.h
@ -1,128 +0,0 @@
-#ifndef PPCG_H
-#define PPCG_H
-
-#include <isl/schedule.h>
-#include <isl/set.h>
-#include <isl/union_set.h>
-#include <isl/union_map.h>
-#include <isl/id_to_ast_expr.h>
-#include <pet.h>
-
-#include "ppcg_options.h"
-
-const char *ppcg_base_name(const char *filename);
-int ppcg_extract_base_name(char *name, const char *input);
-
-/* Representation of the scop for use inside PPCG.
- *
- * "options" are the options specified by the user.
- * Some fields in this structure may depend on some of the options.
- *
- * "start" and "end" are file offsets of the corresponding program text.
- * "context" represents constraints on the parameters.
- * "domain" is the union of all iteration domains.
- * "call" contains the iteration domains of statements with a call expression.
- * "reads" contains all potential read accesses.
- * "tagged_reads" is the same as "reads", except that the domain is a wrapped
- *	relation mapping an iteration domain to a reference identifier
- * "live_in" contains the potential read accesses that potentially
- *	have no corresponding writes in the scop.
- * "may_writes" contains all potential write accesses.
- * "tagged_may_writes" is the same as "may_writes", except that the domain
- *	is a wrapped relation mapping an iteration domain
- *	to a reference identifier
- * "must_writes" contains all definite write accesses.
- * "tagged_must_writes" is the same as "must_writes", except that the domain
- *	is a wrapped relation mapping an iteration domain
- *	to a reference identifier
- * "live_out" contains the potential write accesses that are potentially
- *	not killed by any kills or any other writes.
- * "must_kills" contains all definite kill accesses.
- * "tagged_must_kills" is the same as "must_kills", except that the domain
- *	is a wrapped relation mapping an iteration domain
- *	to a reference identifier.
- *
- * "tagger" maps tagged iteration domains to the corresponding untagged
- *	iteration domain.
- *
- * "independence" is the union of all independence filters.
- *
- * "dep_flow" represents the potential flow dependences.
- * "tagged_dep_flow" is the same as "dep_flow", except that both domain and
- *	range are wrapped relations mapping an iteration domain to
- *	a reference identifier.  May be NULL if not computed.
- * "dep_false" represents the potential false (anti and output) dependences.
- * "dep_forced" represents the validity constraints that should be enforced
- *	even when live-range reordering is used.
- *	In particular, these constraints ensure that all live-in
- *	accesses remain live-in and that all live-out accesses remain live-out
- *	and that multiple potential sources for the same read are
- *	executed in the original order.
- * "dep_order"/"tagged_dep_order" represents the order dependences between
- *	the live range intervals in "dep_flow"/"tagged_dep_flow".
- *	It is only used if the live_range_reordering
- *	option is set.  Otherwise it is NULL.
- *	If "dep_order" is used, then "dep_false" only contains a limited
- *	set of anti and output dependences.
- * "schedule" represents the (original) schedule.
- *
- * "names" contains all variable names that are in use by the scop.
- * The names are mapped to a dummy value.
- *
- * "pet" is the original pet_scop.
- */
-struct ppcg_scop {
-	struct ppcg_options *options;
-
-	unsigned start;
-	unsigned end;
-
-	isl_set *context;
-	isl_union_set *domain;
-	isl_union_set *call;
-	isl_union_map *tagged_reads;
-	isl_union_map *reads;
-	isl_union_map *live_in;
-	isl_union_map *tagged_may_writes;
-	isl_union_map *may_writes;
-	isl_union_map *tagged_must_writes;
-	isl_union_map *must_writes;
-	isl_union_map *live_out;
-	isl_union_map *tagged_must_kills;
-	isl_union_map *must_kills;
-
-	isl_union_pw_multi_aff *tagger;
-
-	isl_union_map *independence;
-
-	isl_union_map *dep_flow;
-	isl_union_map *tagged_dep_flow;
-	isl_union_map *dep_false;
-	isl_union_map *dep_forced;
-	isl_union_map *dep_order;
-	isl_union_map *tagged_dep_order;
-	isl_schedule *schedule;
-
-	isl_id_to_ast_expr *names;
-
-	struct pet_scop *pet;
-};
-
-int ppcg_scop_any_hidden_declarations(struct ppcg_scop *scop);
-__isl_give isl_id_list *ppcg_scop_generate_names(struct ppcg_scop *scop,
-	int n, const char *prefix);
-
-int ppcg_transform(isl_ctx *ctx, const char *input, FILE *out,
-	struct ppcg_options *options,
-	__isl_give isl_printer *(*fn)(__isl_take isl_printer *p,
-		struct ppcg_scop *scop, void *user), void *user);
-
-__isl_give isl_schedule *ppcg_compute_schedule(
-	__isl_take isl_schedule_constraints *sc,
-	__isl_keep isl_schedule *schedule, struct ppcg_options *options);
-
-void compute_tagger(struct ppcg_scop *ps);
-void compute_dependences(struct ppcg_scop *scop);
-void eliminate_dead_code(struct ppcg_scop *ps);
-void *ppcg_scop_free(struct ppcg_scop *ps);
-#endif
--- a/polly/lib/External/ppcg/ppcg_options.c
+++ b/polly/lib/External/ppcg/ppcg_options.c
@ -1,136 +0,0 @@
-/*
- * Copyright 2010-2011 INRIA Saclay
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
- * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
- * 91893 Orsay, France
- */
-
-#include "ppcg_options.h"
-
-static struct isl_arg_choice target[] = {
-	{"c",		PPCG_TARGET_C},
-	{"cuda",	PPCG_TARGET_CUDA},
-	{"opencl",      PPCG_TARGET_OPENCL},
-	{0}
-};
-
-/* Set defaults that depend on the target.
- * In particular, set --schedule-outer-coincidence iff target is a GPU.
- */
-void ppcg_options_set_target_defaults(struct ppcg_options *options)
-{
-	char *argv[2] = { NULL };
-
-	argv[0] = "ppcg_options_set_target_defaults";
-	if (options->target == PPCG_TARGET_C)
-		argv[1] = "--no-schedule-outer-coincidence";
-	else
-		argv[1] = "--schedule-outer-coincidence";
-
-	isl_options_parse(options->isl, 2, argv, ISL_ARG_ALL);
-}
-
-/* Callback that is called whenever the "target" option is set (to "val").
- * The callback is called after target has been updated.
- *
- * Call ppcg_options_set_target_defaults to reset the target-dependent options.
- */
-static int set_target(void *opt, unsigned val)
-{
-	struct ppcg_options *options = opt;
-
-	ppcg_options_set_target_defaults(options);
-
-	return 0;
-}
-
-ISL_ARGS_START(struct ppcg_debug_options, ppcg_debug_options_args)
-ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule_constraints, 0,
-	"dump-schedule-constraints", 0, "dump schedule constraints")
-ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule, 0,
-	"dump-schedule", 0, "dump isl computed schedule")
-ISL_ARG_BOOL(struct ppcg_debug_options, dump_final_schedule, 0,
-	"dump-final-schedule", 0, "dump PPCG computed schedule")
-ISL_ARG_BOOL(struct ppcg_debug_options, dump_sizes, 0,
-	"dump-sizes", 0,
-	"dump effectively used per kernel tile, grid and block sizes")
-ISL_ARG_BOOL(struct ppcg_debug_options, verbose, 'v', "verbose", 0, NULL)
-ISL_ARGS_END
-
-ISL_ARGS_START(struct ppcg_options, ppcg_opencl_options_args)
-ISL_ARG_STR(struct ppcg_options, opencl_compiler_options, 0, "compiler-options",
-	"options", NULL, "options to pass to the OpenCL compiler")
-ISL_ARG_BOOL(struct ppcg_options, opencl_use_gpu, 0, "use-gpu", 1,
-	"use GPU device (if available)")
-ISL_ARG_STR_LIST(struct ppcg_options, opencl_n_include_file,
-	opencl_include_files, 0, "include-file", "filename",
-	"file to #include in generated OpenCL code")
-ISL_ARG_BOOL(struct ppcg_options, opencl_print_kernel_types, 0,
-	"print-kernel-types", 1,
-	"print definitions of types in the kernel file")
-ISL_ARG_BOOL(struct ppcg_options, opencl_embed_kernel_code, 0,
-	"embed-kernel-code", 0, "embed kernel code into host code")
-ISL_ARGS_END
-
-ISL_ARGS_START(struct ppcg_options, ppcg_options_args)
-ISL_ARG_CHILD(struct ppcg_options, isl, "isl", &isl_options_args, "isl options")
-ISL_ARG_CHILD(struct ppcg_options, debug, NULL, &ppcg_debug_options_args,
-	"debugging options")
-ISL_ARG_BOOL(struct ppcg_options, group_chains, 0, "group-chains", 1,
-	"group chains of interdependent statements that are executed "
-	"consecutively in the original schedule before scheduling")
-ISL_ARG_BOOL(struct ppcg_options, reschedule, 0, "reschedule", 1,
-	"replace original schedule by isl computed schedule")
-ISL_ARG_BOOL(struct ppcg_options, scale_tile_loops, 0,
-	"scale-tile-loops", 1, NULL)
-ISL_ARG_BOOL(struct ppcg_options, wrap, 0, "wrap", 1, NULL)
-ISL_ARG_BOOL(struct ppcg_options, use_shared_memory, 0, "shared-memory", 1,
-	"use shared memory in kernel code")
-ISL_ARG_BOOL(struct ppcg_options, use_private_memory, 0, "private-memory", 1,
-	"use private memory in kernel code")
-ISL_ARG_STR(struct ppcg_options, ctx, 0, "ctx", "context", NULL,
-    "Constraints on parameters")
-ISL_ARG_BOOL(struct ppcg_options, non_negative_parameters, 0,
-	"assume-non-negative-parameters", 0,
-	"assume all parameters are non-negative)")
-ISL_ARG_BOOL(struct ppcg_options, tile, 0, "tile", 0,
-	"perform tiling (C target)")
-ISL_ARG_INT(struct ppcg_options, tile_size, 'S', "tile-size", "size", 32, NULL)
-ISL_ARG_BOOL(struct ppcg_options, isolate_full_tiles, 0, "isolate-full-tiles",
-	0, "isolate full tiles from partial tiles (hybrid tiling)")
-ISL_ARG_STR(struct ppcg_options, sizes, 0, "sizes", "sizes", NULL,
-	"Per kernel tile, grid and block sizes")
-ISL_ARG_INT(struct ppcg_options, max_shared_memory, 0,
-	"max-shared-memory", "size", 8192, "maximal amount of shared memory")
-ISL_ARG_BOOL(struct ppcg_options, openmp, 0, "openmp", 0,
-	"Generate OpenMP macros (only for C target)")
-ISL_ARG_USER_OPT_CHOICE(struct ppcg_options, target, 0, "target", target,
-	&set_target, PPCG_TARGET_CUDA, PPCG_TARGET_CUDA,
-	"the target to generate code for")
-ISL_ARG_BOOL(struct ppcg_options, linearize_device_arrays, 0,
-	"linearize-device-arrays", 1,
-	"linearize all device arrays, even those of fixed size")
-ISL_ARG_BOOL(struct ppcg_options, allow_gnu_extensions, 0,
-	"allow-gnu-extensions", 1,
-	"allow the use of GNU extensions in generated code")
-ISL_ARG_BOOL(struct ppcg_options, live_range_reordering, 0,
-	"live-range-reordering", 1,
-	"allow successive live ranges on the same memory element "
-	"to be reordered")
-ISL_ARG_BOOL(struct ppcg_options, hybrid, 0, "hybrid", 0,
-	"apply hybrid tiling whenever a suitable input pattern is found "
-	"(GPU targets)")
-ISL_ARG_BOOL(struct ppcg_options, unroll_copy_shared, 0, "unroll-copy-shared",
-	0, "unroll code for copying to/from shared memory")
-ISL_ARG_BOOL(struct ppcg_options, unroll_gpu_tile, 0, "unroll-gpu-tile", 0,
-	"unroll code inside tile on GPU targets")
-ISL_ARG_GROUP("opencl", &ppcg_opencl_options_args, "OpenCL options")
-ISL_ARG_STR(struct ppcg_options, save_schedule_file, 0, "save-schedule",
-	"file", NULL, "save isl computed schedule to <file>")
-ISL_ARG_STR(struct ppcg_options, load_schedule_file, 0, "load-schedule",
-	"file", NULL, "load schedule from <file>, "
-	"using it instead of an isl computed schedule")
-ISL_ARGS_END
--- a/polly/lib/External/ppcg/ppcg_options.h
+++ b/polly/lib/External/ppcg/ppcg_options.h
@ -1,100 +0,0 @@
-#ifndef PPCG_OPTIONS_H
-#define PPCG_OPTIONS_H
-
-#include <isl/arg.h>
-#include <isl/options.h>
-
-struct ppcg_debug_options {
-	int dump_schedule_constraints;
-	int dump_schedule;
-	int dump_final_schedule;
-	int dump_sizes;
-	int verbose;
-};
-
-struct ppcg_options {
-	struct isl_options *isl;
-	struct ppcg_debug_options *debug;
-
-	/* Group chains of consecutive statements before scheduling. */
-	int group_chains;
-
-	/* Use isl to compute a schedule replacing the original schedule. */
-	int reschedule;
-	int scale_tile_loops;
-	int wrap;
-
-	/* Assume all parameters are non-negative. */
-	int non_negative_parameters;
-	char *ctx;
-	char *sizes;
-
-	/* Perform tiling (C target). */
-	int tile;
-	int tile_size;
-
-	/* Isolate full tiles from partial tiles. */
-	int isolate_full_tiles;
-
-	/* Take advantage of private memory. */
-	int use_private_memory;
-
-	/* Take advantage of shared memory. */
-	int use_shared_memory;
-
-	/* Maximal amount of shared memory. */
-	int max_shared_memory;
-
-	/* The target we generate code for. */
-	int target;
-
-	/* Generate OpenMP macros (C target only). */
-	int openmp;
-
-	/* Linearize all device arrays. */
-	int linearize_device_arrays;
-
-	/* Allow the use of GNU extensions in generated code. */
-	int allow_gnu_extensions;
-
-	/* Allow live range to be reordered. */
-	int live_range_reordering;
-
-	/* Allow hybrid tiling whenever a suitable input pattern is found. */
-	int hybrid;
-
-	/* Unroll the code for copying to/from shared memory. */
-	int unroll_copy_shared;
-	/* Unroll code inside tile on GPU targets. */
-	int unroll_gpu_tile;
-
-	/* Options to pass to the OpenCL compiler.  */
-	char *opencl_compiler_options;
-	/* Prefer GPU device over CPU. */
-	int opencl_use_gpu;
-	/* Number of files to include. */
-	int opencl_n_include_file;
-	/* Files to include. */
-	const char **opencl_include_files;
-	/* Print definitions of types in kernels. */
-	int opencl_print_kernel_types;
-	/* Embed OpenCL kernel code in host code. */
-	int opencl_embed_kernel_code;
-
-	/* Name of file for saving isl computed schedule or NULL. */
-	char *save_schedule_file;
-	/* Name of file for loading schedule or NULL. */
-	char *load_schedule_file;
-};
-
-ISL_ARG_DECL(ppcg_debug_options, struct ppcg_debug_options,
-	ppcg_debug_options_args)
-ISL_ARG_DECL(ppcg_options, struct ppcg_options, ppcg_options_args)
-
-#define		PPCG_TARGET_C		0
-#define		PPCG_TARGET_CUDA	1
-#define		PPCG_TARGET_OPENCL      2
-
-void ppcg_options_set_target_defaults(struct ppcg_options *options);
-
-#endif
--- a/polly/lib/External/ppcg/print.c
+++ b/polly/lib/External/ppcg/print.c
@ -1,461 +0,0 @@
-/*
- * Copyright 2012-2013 Ecole Normale Superieure
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
- */
-
-#include <isl/aff.h>
-#include <isl/ast_build.h>
-#include <isl/id.h>
-
-#include "print.h"
-#include "util.h"
-
-__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p)
-{
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "{");
-	p = isl_printer_end_line(p);
-	p = isl_printer_indent(p, 2);
-	return p;
-}
-
-__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p)
-{
-	p = isl_printer_indent(p, -2);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "}");
-	p = isl_printer_end_line(p);
-	return p;
-}
-
-/* Names of notes that keep track of whether min/max
- * macro definitions have already been printed.
- */
-static const char *ppcg_max_printed = "ppcg_max_printed";
-static const char *ppcg_min_printed = "ppcg_min_printed";
-
-/* Has the macro definition corresponding to "note_name" been printed
- * to "p" before?
- * That is, does "p" have an associated "note_name" note?
- */
-static isl_bool printed_before(__isl_keep isl_printer *p, const char *note_name)
-{
-	isl_ctx *ctx;
-	isl_id *id;
-	isl_bool printed;
-
-	if (!p)
-		return isl_bool_error;
-
-	ctx = isl_printer_get_ctx(p);
-	id = isl_id_alloc(ctx, note_name, NULL);
-	printed = isl_printer_has_note(p, id);
-	isl_id_free(id);
-
-	return printed;
-}
-
-/* Keep track of the fact that the macro definition corresponding
- * to "note_name" has been printed to "p" by attaching a note with
- * that name.  The value of the note is of no importance, but it
- * has to be a valid isl_id, so the note identifier is reused
- * as the note.
- */
-static __isl_give isl_printer *mark_printed(__isl_take isl_printer *p,
-	const char *note_name)
-{
-	isl_ctx *ctx;
-	isl_id *id;
-
-	if (!p)
-		return NULL;
-
-	ctx = isl_printer_get_ctx(p);
-	id = isl_id_alloc(ctx, note_name, NULL);
-	return isl_printer_set_note(p, id, isl_id_copy(id));
-}
-
-/* Print a macro definition "def" for the macro "name" to "p",
- * unless such a macro definition has been printed to "p" before.
- * "note_name" is used as the name of the note that keeps track
- * of whether this printing has happened.
- */
-static __isl_give isl_printer *print_ppcg_macro(__isl_take isl_printer *p,
-	const char *name, const char *def, const char *note_name)
-{
-	isl_bool printed;
-
-	printed = printed_before(p, note_name);
-	if (printed < 0)
-		return isl_printer_free(p);
-	if (printed)
-		return p;
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "#define ");
-	p = isl_printer_print_str(p, name);
-	p = isl_printer_print_str(p, def);
-	p = isl_printer_end_line(p);
-
-	p = mark_printed(p, note_name);
-
-	return p;
-}
-
-/* Structure for keeping track of definitions of some macros.
- */
-struct ppcg_macros {
-	const char *min;
-	const char *max;
-};
-
-/* Free the memory allocated by a struct ppcg_macros.
- */
-static void ppcg_macros_free(void *user)
-{
-	free(user);
-}
-
-/* Default macro definitions (when GNU extensions are allowed).
- */
-struct ppcg_macros ppcg_macros_default = {
-	.min = "(x,y)    "
-		"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
-		"_x < _y ? _x : _y; })",
-	.max = "(x,y)    "
-		"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
-		"_x > _y ? _x : _y; })",
-};
-
-/* Name used for the note that keeps track of macro definitions.
- */
-static const char *ppcg_macros = "ppcg_macros";
-
-/* Set the macro definitions for isl_ast_op_min and isl_ast_op_max
- * to "min" and "max" and store them in "p".
- *
- * In particular, create a ppcg_macros object and attach it
- * as a note to the printer.
- */
-__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
-	const char *min, const char *max)
-{
-	isl_ctx *ctx;
-	isl_id *id, *macros_id;
-	struct ppcg_macros *macros;
-
-	if (!p)
-		return NULL;
-
-	ctx = isl_printer_get_ctx(p);
-	macros = isl_alloc_type(ctx, struct ppcg_macros);
-	if (!macros)
-		return isl_printer_free(p);
-	macros->min = min;
-	macros->max = max;
-	id = isl_id_alloc(ctx, ppcg_macros, NULL);
-	macros_id = isl_id_alloc(ctx, NULL, macros);
-	if (!macros_id)
-		ppcg_macros_free(macros);
-	else
-		macros_id = isl_id_set_free_user(macros_id, &ppcg_macros_free);
-
-	p = isl_printer_set_note(p, id, macros_id);
-
-	return p;
-}
-
-/* Return the ppcg_macros object that holds the currently active
- * macro definitions in "p".
- * If "p" has a note with macro definitions, then return those.
- * Otherwise, return the default macro definitions.
- */
-static struct ppcg_macros *get_macros(__isl_keep isl_printer *p)
-{
-	isl_id *id;
-	isl_bool has_macros;
-	struct ppcg_macros *macros;
-
-	id = isl_id_alloc(isl_printer_get_ctx(p), ppcg_macros, NULL);
-	has_macros = isl_printer_has_note(p, id);
-	if (has_macros < 0 || !has_macros) {
-		isl_id_free(id);
-		if (has_macros < 0)
-			return NULL;
-		return &ppcg_macros_default;
-	}
-	id = isl_printer_get_note(p, id);
-	macros = isl_id_get_user(id);
-	isl_id_free(id);
-
-	return macros;
-}
-
-/* Print the currently active macro definition for ppcg_max.
- */
-static __isl_give isl_printer *print_max(__isl_take isl_printer *p)
-{
-	struct ppcg_macros *macros;
-
-	macros = get_macros(p);
-	if (!macros)
-		return isl_printer_free(p);
-	return print_ppcg_macro(p, ppcg_max, macros->max, ppcg_max_printed);
-}
-
-/* Print the currently active macro definition for ppcg_min.
- */
-static __isl_give isl_printer *print_min(__isl_take isl_printer *p)
-{
-	struct ppcg_macros *macros;
-
-	macros = get_macros(p);
-	if (!macros)
-		return isl_printer_free(p);
-	return print_ppcg_macro(p, ppcg_min, macros->min, ppcg_min_printed);
-}
-
-/* Print a macro definition for "type" to "p".
- * If GNU extensions are allowed, then print a specialized definition
- * for isl_ast_op_min and isl_ast_op_max.
- * Otherwise, use the default isl definition.
- */
-__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
-	__isl_take isl_printer *p)
-{
-	isl_ctx *ctx;
-	struct ppcg_options *options;
-
-	if (!p)
-		return NULL;
-
-	ctx = isl_printer_get_ctx(p);
-	options = isl_ctx_peek_options(ctx, &ppcg_options_args);
-	if (!options || !options->allow_gnu_extensions)
-		return isl_ast_op_type_print_macro(type, p);
-
-	switch (type) {
-	case isl_ast_op_max:
-		return print_max(p);
-	case isl_ast_op_min:
-		return print_min(p);
-	default:
-		return isl_ast_op_type_print_macro(type, p);
-	}
-}
-
-/* isl_ast_expr_foreach_ast_op_type or isl_ast_node_foreach_ast_op_type
- * callback that prints a macro definition for "type".
- */
-static isl_stat print_macro(enum isl_ast_op_type type, void *user)
-{
-	isl_printer **p = user;
-
-	*p = ppcg_print_macro(type, *p);
-	if (!*p)
-		return isl_stat_error;
-
-	return isl_stat_ok;
-}
-
-/* Print the required macros for "expr".
- */
-__isl_give isl_printer *ppcg_ast_expr_print_macros(
-	__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p)
-{
-	if (isl_ast_expr_foreach_ast_op_type(expr, &print_macro, &p) < 0)
-		return isl_printer_free(p);
-	return p;
-}
-
-/* isl_id_to_ast_expr_foreach callback that prints the required
- * macro definitions for "val".
- */
-static isl_stat print_expr_macros(__isl_take isl_id *key,
-	__isl_take isl_ast_expr *val, void *user)
-{
-	isl_printer **p = user;
-
-	*p = ppcg_ast_expr_print_macros(val, *p);
-	isl_id_free(key);
-	isl_ast_expr_free(val);
-
-	if (!*p)
-		return isl_stat_error;
-	return isl_stat_ok;
-}
-
-/* Print the required macro definitions for the body of a statement in which
- * the access expressions are replaced by the isl_ast_expr objects
- * in "ref2expr".
- */
-__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
-	__isl_keep isl_id_to_ast_expr *ref2expr)
-{
-	if (isl_id_to_ast_expr_foreach(ref2expr, &print_expr_macros, &p) < 0)
-		return isl_printer_free(p);
-	return p;
-}
-
-/* Print the required macros for "node".
- */
-__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
-	__isl_keep isl_ast_node *node)
-{
-	if (isl_ast_node_foreach_ast_op_type(node, &print_macro, &p) < 0)
-		return isl_printer_free(p);
-	return p;
-}
-
-/* Names used for the macros that may appear in a printed isl AST.
- */
-const char *ppcg_min = "ppcg_min";
-const char *ppcg_max = "ppcg_max";
-const char *ppcg_fdiv_q = "ppcg_fdiv_q";
-
-/* Set the names of the macros that may appear in a printed isl AST.
- */
-__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p)
-{
-	p = isl_ast_op_type_set_print_name(p, isl_ast_op_min, ppcg_min);
-	p = isl_ast_op_type_set_print_name(p, isl_ast_op_max, ppcg_max);
-	p = isl_ast_op_type_set_print_name(p, isl_ast_op_fdiv_q, ppcg_fdiv_q);
-
-	return p;
-}
-
-/* Given a multi affine expression "mpa" without domain, modify it to have
- * the schedule space of "build" as domain.
- *
- * If the schedule space of "build" is a parameter space, then nothing
- * needs to be done.
- * Otherwise, "mpa" is first given a 0D domain and then it is combined
- * with a mapping from the schedule space of "build" to the same 0D domain.
- */
-__isl_give isl_multi_pw_aff *ppcg_attach_multi_pw_aff(
-	__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_ast_build *build)
-{
-	isl_bool params;
-	isl_space *space;
-	isl_multi_aff *ma;
-
-	space = isl_ast_build_get_schedule_space(build);
-	params = isl_space_is_params(space);
-	if (params < 0 || params) {
-		isl_space_free(space);
-		if (params < 0)
-			return isl_multi_pw_aff_free(mpa);
-		return mpa;
-	}
-	space = isl_space_from_domain(space);
-	ma = isl_multi_aff_zero(space);
-	mpa = isl_multi_pw_aff_from_range(mpa);
-	mpa = isl_multi_pw_aff_pullback_multi_aff(mpa, ma);
-
-	return mpa;
-}
-
-/* Build an access AST expression from "size" using "build".
- * "size" does not have a domain, but "build" may have a proper schedule space.
- * First modify "size" to have that schedule space as domain.
- */
-__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
-	__isl_keep isl_ast_build *build)
-{
-	size = ppcg_attach_multi_pw_aff(size, build);
-	return isl_ast_build_access_from_multi_pw_aff(build, size);
-}
-
-/* Print a declaration for an array with element type "base_type" and
- * size "size" to "p".
- */
-__isl_give isl_printer *ppcg_print_declaration_with_size(
-	__isl_take isl_printer *p, const char *base_type,
-	__isl_keep isl_ast_expr *size)
-{
-	if (!base_type || !size)
-		return isl_printer_free(p);
-
-	p = ppcg_ast_expr_print_macros(size, p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, base_type);
-	p = isl_printer_print_str(p, " ");
-	p = isl_printer_print_ast_expr(p, size);
-	p = isl_printer_print_str(p, ";");
-	p = isl_printer_end_line(p);
-
-	return p;
-}
-
-/* Print a declaration for array "array" to "p", using "build"
- * to simplify any size expressions.
- *
- * The size is computed from the extent of the array and is
- * subsequently converted to an "access expression" by "build".
- */
-__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
-	struct pet_array *array, __isl_keep isl_ast_build *build)
-{
-	isl_multi_pw_aff *size;
-	isl_ast_expr *expr;
-
-	if (!array)
-		return isl_printer_free(p);
-
-	size = ppcg_size_from_extent(isl_set_copy(array->extent));
-	expr = isl_ast_build_access_from_multi_pw_aff(build, size);
-	p = ppcg_print_declaration_with_size(p, array->element_type, expr);
-	isl_ast_expr_free(expr);
-
-	return p;
-}
-
-/* Print declarations for the arrays in "scop" that are declared
- * and that are exposed (if exposed == 1) or not exposed (if exposed == 0).
- */
-static __isl_give isl_printer *print_declarations(__isl_take isl_printer *p,
-	struct ppcg_scop *scop, int exposed)
-{
-	int i;
-	isl_ast_build *build;
-
-	if (!scop)
-		return isl_printer_free(p);
-
-	build = isl_ast_build_from_context(isl_set_copy(scop->context));
-	for (i = 0; i < scop->pet->n_array; ++i) {
-		struct pet_array *array = scop->pet->arrays[i];
-
-		if (!array->declared)
-			continue;
-		if (array->exposed != exposed)
-			continue;
-
-		p = ppcg_print_declaration(p, array, build);
-	}
-	isl_ast_build_free(build);
-
-	return p;
-}
-
-/* Print declarations for the arrays in "scop" that are declared
- * and exposed to the code after the scop.
- */
-__isl_give isl_printer *ppcg_print_exposed_declarations(
-	__isl_take isl_printer *p, struct ppcg_scop *scop)
-{
-	return print_declarations(p, scop, 1);
-}
-
-/* Print declarations for the arrays in "scop" that are declared,
- * but not exposed to the code after the scop.
- */
-__isl_give isl_printer *ppcg_print_hidden_declarations(
-	__isl_take isl_printer *p, struct ppcg_scop *scop)
-{
-	return print_declarations(p, scop, 0);
-}
--- a/polly/lib/External/ppcg/print.h
+++ b/polly/lib/External/ppcg/print.h
@ -1,40 +0,0 @@
-#ifndef PRINT_H
-#define PRINT_H
-
-#include <isl/ast.h>
-
-#include "ppcg.h"
-
-extern const char *ppcg_min;
-extern const char *ppcg_max;
-extern const char *ppcg_fdiv_q;
-
-__isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p);
-__isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p);
-
-__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p);
-__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
-	const char *min, const char *max);
-__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
-	__isl_take isl_printer *p);
-__isl_give isl_printer *ppcg_ast_expr_print_macros(
-	__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p);
-__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
-	__isl_keep isl_id_to_ast_expr *ref2expr);
-__isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
-	__isl_keep isl_ast_node *node);
-
-__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
-	__isl_keep isl_ast_build *build);
-
-__isl_give isl_printer *ppcg_print_declaration_with_size(
-	__isl_take isl_printer *p, const char *base_type,
-	__isl_keep isl_ast_expr *size);
-__isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
-	struct pet_array *array, __isl_keep isl_ast_build *build);
-__isl_give isl_printer *ppcg_print_exposed_declarations(
-	__isl_take isl_printer *p, struct ppcg_scop *scop);
-__isl_give isl_printer *ppcg_print_hidden_declarations(
-	__isl_take isl_printer *p, struct ppcg_scop *scop);
-
-#endif
--- a/polly/lib/External/ppcg/schedule.c
+++ b/polly/lib/External/ppcg/schedule.c
@ -1,165 +0,0 @@
-/*
- * Copyright 2010-2011 INRIA Saclay
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
- * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
- * 91893 Orsay, France
- */
-
-#include <assert.h>
-#include <ctype.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <isl/set.h>
-#include <isl/map.h>
-#include <isl/constraint.h>
-
-#include "schedule.h"
-
-/* Add parameters with identifiers "ids" to "set".
- */
-static __isl_give isl_set *add_params(__isl_take isl_set *set,
-	__isl_keep isl_id_list *ids)
-{
-	int i, n;
-	unsigned nparam;
-
-	n = isl_id_list_n_id(ids);
-
-	nparam = isl_set_dim(set, isl_dim_param);
-	set = isl_set_add_dims(set, isl_dim_param, n);
-
-	for (i = 0; i < n; ++i) {
-		isl_id *id;
-
-		id = isl_id_list_get_id(ids, i);
-		set = isl_set_set_dim_id(set, isl_dim_param, nparam + i, id);
-	}
-
-	return set;
-}
-
-/* Equate the dimensions of "set" starting at "first" to
- * freshly created parameters with identifiers "ids".
- * The number of equated dimensions is equal to the number of elements in "ids".
- */
-static __isl_give isl_set *parametrize(__isl_take isl_set *set,
-	int first, __isl_keep isl_id_list *ids)
-{
-	int i, n;
-	unsigned nparam;
-
-	nparam = isl_set_dim(set, isl_dim_param);
-
-	set = add_params(set, ids);
-
-	n = isl_id_list_n_id(ids);
-	for (i = 0; i < n; ++i)
-		set = isl_set_equate(set, isl_dim_param, nparam + i,
-					isl_dim_set, first + i);
-
-	return set;
-}
-
-/* Given a parameter space "space", create a set of dimension "len"
- * of which the dimensions starting at "first" are equated to
- * freshly created parameters with identifiers "ids".
- */
-__isl_give isl_set *parametrization(__isl_take isl_space *space,
-	int len, int first, __isl_keep isl_id_list *ids)
-{
-	isl_set *set;
-
-	space = isl_space_set_from_params(space);
-	space = isl_space_add_dims(space, isl_dim_set, len);
-	set = isl_set_universe(space);
-
-	return parametrize(set, first, ids);
-}
-
-/* Load and return a schedule from a file called "filename".
- */
-static __isl_give isl_schedule *load_schedule(isl_ctx *ctx,
-	const char *filename)
-{
-	FILE *file;
-	isl_schedule *schedule;
-
-	file = fopen(filename, "r");
-	if (!file) {
-		fprintf(stderr, "Unable to open '%s' for reading\n", filename);
-		return NULL;
-	}
-	schedule = isl_schedule_read_from_file(ctx, file);
-	fclose(file);
-
-	return schedule;
-}
-
-/* Save the schedule "schedule" to a file called "filename".
- * The schedule is printed in block style.
- */
-static void save_schedule(__isl_keep isl_schedule *schedule,
-	const char *filename)
-{
-	FILE *file;
-	isl_ctx *ctx;
-	isl_printer *p;
-
-	if (!schedule)
-		return;
-
-	file = fopen(filename, "w");
-	if (!file) {
-		fprintf(stderr, "Unable to open '%s' for writing\n", filename);
-		return;
-	}
-	ctx = isl_schedule_get_ctx(schedule);
-	p = isl_printer_to_file(ctx, file);
-	p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
-	p = isl_printer_print_schedule(p, schedule);
-	isl_printer_free(p);
-	fclose(file);
-}
-
-/* Obtain a schedule, either by reading it form a file
- * or by computing it using "compute".
- * Also take care of saving the computed schedule and/or
- * dumping the obtained schedule if requested by the user.
- */
-__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
-	struct ppcg_options *options,
-	__isl_give isl_schedule *(*compute)(void *user), void *user)
-{
-	isl_schedule *schedule;
-
-	if (options->load_schedule_file) {
-		schedule = load_schedule(ctx, options->load_schedule_file);
-	} else {
-		schedule = compute(user);
-		if (options->save_schedule_file)
-			save_schedule(schedule, options->save_schedule_file);
-	}
-	if (options->debug->dump_schedule)
-		isl_schedule_dump(schedule);
-
-	return schedule;
-}
-
-/* Mark all dimensions in the band node "node" to be of "type".
- */
-__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
-	__isl_take isl_schedule_node *node, enum isl_ast_loop_type type)
-{
-	int i, n;
-
-	n = isl_schedule_node_band_n_member(node);
-	for (i = 0; i < n; ++i)
-		node = isl_schedule_node_band_member_set_ast_loop_type(node, i,
-							type);
-
-	return node;
-}
--- a/polly/lib/External/ppcg/schedule.h
+++ b/polly/lib/External/ppcg/schedule.h
@ -1,21 +0,0 @@
-#ifndef _SCHEDULE_H
-#define _SCHEDULE_H
-
-#include <isl/id.h>
-#include <isl/space.h>
-#include <isl/schedule.h>
-#include <isl/schedule_node.h>
-
-#include "ppcg_options.h"
-
-__isl_give isl_set *parametrization(__isl_take isl_space *space,
-	int len, int first, __isl_keep isl_id_list *names);
-
-__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
-	struct ppcg_options *options,
-	__isl_give isl_schedule *(*compute)(void *user), void *user);
-
-__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
-	__isl_take isl_schedule_node *node, enum isl_ast_loop_type type);
-
-#endif
--- a/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c
+++ b/polly/lib/External/ppcg/tests/allow-sparse-copy-in.c
@ -1,49 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
-	int A[2][1000][1000];
-	int B[2][1000][1000];
-
-#pragma scop
-	{
-		for (int i = 0; i < 256; ++i)
-			for (int j = 0; j < 256; ++j)
-				if (j % 8 <= 2 || j % 8 >= 6)
-					A[1][i][j] = B[1][j][i];
-	}
-#pragma endscop
-
-/* 
-
-When compiled with:
-
-./ppcg tests/allow-sparse-copy-in.c --no-linearize-device-arrays
-	--on-error=abort --sizes='{kernel[i]->tile[8,8]; kernel[i]->block[1,8]}'
-	--max-shared-memory=-1  --unroll-copy-shared
-
-this originally resulted in the following copy-in code:
-
-      shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1];
-      shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1];
-      shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1];
-      shared_B[0][3][t1] = B[1][8 * b1 + 3][8 * b0 + t1];
-      shared_B[0][4][t1] = B[1][8 * b1 + 4][8 * b0 + t1];
-      shared_B[0][5][t1] = B[1][8 * b1 + 5][8 * b0 + t1];
-      shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1];
-      shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1];
-
-whereas we only want to only perform copies that are actually needed:
-
-      shared_B[0][0][t1] = B[1][8 * b1][8 * b0 + t1];
-      shared_B[0][1][t1] = B[1][8 * b1 + 1][8 * b0 + t1];
-      shared_B[0][2][t1] = B[1][8 * b1 + 2][8 * b0 + t1];
-      shared_B[0][6][t1] = B[1][8 * b1 + 6][8 * b0 + t1];
-      shared_B[0][7][t1] = B[1][8 * b1 + 7][8 * b0 + t1];
-*/
-	for (int i = 0; i < 100; ++i)
-		if (A[1][0][i] != i)
-			return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/call.c
+++ b/polly/lib/External/ppcg/tests/call.c
@ -1,29 +0,0 @@
-#include <stdlib.h>
-
-void copy_summary(int b[1000], int a[1000], int pos)
-{
-	b[pos] = 0;
-	int c = a[pos];
-}
-
-#ifdef pencil_access
-__attribute__((pencil_access(copy_summary)))
-#endif
-void copy(int b[1000], int a[1000], int pos);
-
-int main()
-{
-	int a[1000], b[1000];
-
-	for (int i = 0; i < 1000; ++i)
-		a[i] = i;
-#pragma scop
-	for (int i = 0; i < 1000; ++i)
-		copy(b, a, i);
-#pragma endscop
-	for (int i = 0; i < 1000; ++i)
-		if (b[i] != a[i])
-			return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/call2.c
+++ b/polly/lib/External/ppcg/tests/call2.c
@ -1,29 +0,0 @@
-#include <stdlib.h>
-
-void copy_summary(int b[1000], int a[1000], int pos)
-{
-	b[pos] = 0;
-	int c = a[pos];
-}
-
-#ifdef pencil_access
-__attribute__((pencil_access(copy_summary)))
-#endif
-void copy(int b[1000], int a[1000], int pos);
-
-int main()
-{
-	int a[2][1000];
-
-	for (int i = 0; i < 1000; ++i)
-		a[0][i] = i;
-#pragma scop
-	for (int i = 0; i < 1000; ++i)
-		copy(a[1], a[0], i);
-#pragma endscop
-	for (int i = 0; i < 1000; ++i)
-		if (a[1][i] != a[0][i])
-			return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/call2_opencl_functions.cl
+++ b/polly/lib/External/ppcg/tests/call2_opencl_functions.cl
@ -1,4 +0,0 @@
-void copy(__global int b[1000], __global int a[1000], int pos)
-{
-	b[pos] = a[pos];
-}
--- a/polly/lib/External/ppcg/tests/call3.c
+++ b/polly/lib/External/ppcg/tests/call3.c
@ -1,32 +0,0 @@
-#include <stdlib.h>
-
-void copy_summary(int b[100], int a[100])
-{
-	for (int i = 0; i < 100; ++i) {
-		b[i] = 0;
-		int c = a[i];
-	}
-}
-
-#ifdef pencil_access
-__attribute__((pencil_access(copy_summary)))
-#endif
-void copy(int b[100], int a[100]);
-
-int main()
-{
-	int A[100][100], B[100];
-
-	for (int i = 0; i < 100; ++i)
-		B[i] = i;
-#pragma scop
-	for (int i = 0; i < 100; ++i)
-		copy(A[i], B);
-#pragma endscop
-	for (int i = 0; i < 100; ++i)
-		for (int j = 0; j < 100; ++j)
-			if (A[j][i] != B[i])
-				return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/call3_opencl_functions.cl
+++ b/polly/lib/External/ppcg/tests/call3_opencl_functions.cl
@ -1,5 +0,0 @@
-void copy(__global int b[100], __global int a[100])
-{
-	for (int i = 0; i < 100; ++i)
-		b[i] = a[i];
-}
--- a/polly/lib/External/ppcg/tests/call_opencl_functions.cl
+++ b/polly/lib/External/ppcg/tests/call_opencl_functions.cl
@ -1,4 +0,0 @@
-void copy(__global int b[1000], __global int a[1000], int pos)
-{
-	b[pos] = a[pos];
-}
--- a/polly/lib/External/ppcg/tests/dead.c
+++ b/polly/lib/External/ppcg/tests/dead.c
@ -1,23 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
-	int a[1000], b[1000];
-
-	for (int i = 0; i < 1000; ++i)
-		a[i] = i;
-#pragma scop
-	for (int i = 0; i < 1000; ++i) {
-		int c;
-		int d;
-		c = a[i];
-		d = c;
-		b[i] = c;
-	}
-#pragma endscop
-	for (int i = 0; i < 1000; ++i)
-		if (b[i] != a[i])
-			return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/iterator.c
+++ b/polly/lib/External/ppcg/tests/iterator.c
@ -1,18 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
-	int i;
-	int a[101];
-
-	i = 0;
-#pragma scop
-	for (i = 0; i < 100; ++i)
-		a[i] = i;
-	a[i] = i;
-#pragma endscop
-	if (a[100] != 100)
-		return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/live_out.c
+++ b/polly/lib/External/ppcg/tests/live_out.c
@ -1,22 +0,0 @@
-#include <stdlib.h>
-
-/* Check that a write access is not removed from the live-out
- * accesses only because a strict subset of the (potentially)
- * accessed elements are killed by a later write.
- */
-int main()
-{
-	int A[10];
-
-	A[1] = 0;
-#pragma scop
-	int i = 1;
-	i = i * i;
-	A[i] = 1;
-	A[0] = 0;
-#pragma endscop
-	if (A[1] != 1)
-		return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/local.c
+++ b/polly/lib/External/ppcg/tests/local.c
@ -1,22 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
-	int A[100];
-
-#pragma scop
-	{
-		int B[100];
-		B[0] = 0;
-		for (int i = 1; i < 100; ++i)
-			B[i] = B[i - 1] + 1;
-		for (int i = 0; i < 100; ++i)
-			A[i] = B[i];
-	}
-#pragma endscop
-	for (int i = 0; i < 100; ++i)
-		if (A[i] != i)
-			return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/loop.c
+++ b/polly/lib/External/ppcg/tests/loop.c
@ -1,18 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
-	int a[1000], b[1000];
-
-	for (int i = 0; i < 1000; ++i)
-		a[i] = i;
-#pragma scop
-	for (int i = 0; i < 1000; ++i)
-		b[i] = a[i];
-#pragma endscop
-	for (int i = 0; i < 1000; ++i)
-		if (b[i] != a[i])
-			return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/not_accessed.c
+++ b/polly/lib/External/ppcg/tests/not_accessed.c
@ -1,29 +0,0 @@
-#include <stdlib.h>
-
-void copy_summary(int b[1000], int a[1000], int pos, int c[1000])
-{
-	b[pos] = 0;
-	int d = a[pos];
-}
-
-#ifdef pencil_access
-__attribute__((pencil_access(copy_summary)))
-#endif
-void copy(int b[1000], int a[1000], int pos, int c[1000]);
-
-int main()
-{
-	int a[1000], b[1000], c[1000];
-
-	for (int i = 0; i < 1000; ++i)
-		a[i] = i;
-#pragma scop
-	for (int i = 0; i < 1000; ++i)
-		copy(b, a, i, c);
-#pragma endscop
-	for (int i = 0; i < 1000; ++i)
-		if (b[i] != a[i])
-			return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl
+++ b/polly/lib/External/ppcg/tests/not_accessed_opencl_functions.cl
@ -1,5 +0,0 @@
-void copy(__global int b[1000], __global int a[1000], int pos,
-	__global int c[1000])
-{
-	b[pos] = a[pos];
-}
--- a/polly/lib/External/ppcg/tests/scalar.c
+++ b/polly/lib/External/ppcg/tests/scalar.c
@ -1,13 +0,0 @@
-#include <stdlib.h>
-
-int main()
-{
-	int a;
-#pragma scop
-	a = 1;
-#pragma endscop
-	if (a != 1)
-		return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/shared_sink.c
+++ b/polly/lib/External/ppcg/tests/shared_sink.c
@ -1,25 +0,0 @@
-#include <stdlib.h>
-
-/* Check that the sources of live ranges with the same sink
- * are executed in order.
- */
-int main()
-{
-	int A[128];
-	int n = 128;
-
-	A[0] = 0;
-#pragma scop
-	for (int i = 0; i < n; ++i) {
-		int set = 0;
-		if (A[i] < 2)
-			set = 1;
-		if (set)
-			A[i] = 2;
-	}
-#pragma endscop
-	if (A[0] != 2)
-		return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/struct.c
+++ b/polly/lib/External/ppcg/tests/struct.c
@ -1,31 +0,0 @@
-#include <stdlib.h>
-
-struct s {
-	int c[10][10];
-};
-
-int main()
-{
-	struct s a[10][10], b[10][10];
-
-	for (int i = 0; i < 10; ++i)
-		for (int j = 0; j < 10; ++j)
-			for (int k = 0; k < 10; ++k)
-				for (int l = 0; l < 10; ++l)
-					a[i][j].c[k][l] = i + j + k + l;
-#pragma scop
-	for (int i = 0; i < 10; ++i)
-		for (int j = 0; j < 10; ++j)
-			for (int k = 0; k < 10; ++k)
-				for (int l = 0; l < 10; ++l)
-					b[i][j].c[k][l] = i + j + k + l;
-#pragma endscop
-	for (int i = 0; i < 10; ++i)
-		for (int j = 0; j < 10; ++j)
-			for (int k = 0; k < 10; ++k)
-				for (int l = 0; l < 10; ++l)
-					if (b[i][j].c[k][l] != a[i][j].c[k][l])
-						return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/struct2.c
+++ b/polly/lib/External/ppcg/tests/struct2.c
@ -1,21 +0,0 @@
-#include <stdlib.h>
-
-struct s {
-	int a;
-};
-
-int main()
-{
-	struct s a, b[10];
-
-#pragma scop
-	a.a = 42;
-	for (int i = 0; i < 10; ++i)
-		b[i].a = a.a;
-#pragma endscop
-	for (int i = 0; i < 10; ++i)
-		if (b[i].a != 42)
-			return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/struct3.c
+++ b/polly/lib/External/ppcg/tests/struct3.c
@ -1,25 +0,0 @@
-#include <stdlib.h>
-
-struct s {
-	int a;
-	int b;
-};
-
-int main()
-{
-	struct s a, b[10];
-
-	a.b = 57;
-#pragma scop
-	a.a = 42;
-	for (int i = 0; i < 10; ++i)
-		b[i] = a;
-#pragma endscop
-	for (int i = 0; i < 10; ++i)
-		if (b[i].a != 42)
-			return EXIT_FAILURE;
-	if (a.b != 57)
-		return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/tests/struct4.c
+++ b/polly/lib/External/ppcg/tests/struct4.c
@ -1,27 +0,0 @@
-#include <stdlib.h>
-
-struct s {
-	int a;
-	int b;
-};
-
-int main()
-{
-	int a[10];
-
-	for (int i = 0; i < 10; ++i)
-		a[i] = 0;
-#pragma scop
-	for (int i = 0; i < 10; ++i) {
-		struct s b;
-		b.a = 1;
-		b.b = i;
-		a[i] = b.a + b.b;
-	}
-#pragma endscop
-	for (int i = 0; i < 10; ++i)
-		if (a[i] != 1 + i)
-			return EXIT_FAILURE;
-
-	return EXIT_SUCCESS;
-}
--- a/polly/lib/External/ppcg/util.c
+++ b/polly/lib/External/ppcg/util.c
@ -1,105 +0,0 @@
-/*
- * Copyright 2012-2013 Ecole Normale Superieure
- *
- * Use of this software is governed by the MIT license
- *
- * Written by Sven Verdoolaege,
- * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
- */
-
-#include <isl/space.h>
-#include <isl/val.h>
-#include <isl/aff.h>
-#include <isl/set.h>
-
-#include "util.h"
-
-/* Construct an isl_multi_val living in "space" with all values equal to "val".
- */
-__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
-	int val)
-{
-	int i, n;
-	isl_ctx *ctx;
-	isl_val *v;
-	isl_multi_val *mv;
-
-	if (!space)
-		return NULL;
-
-	ctx = isl_space_get_ctx(space);
-	n = isl_space_dim(space, isl_dim_set);
-	mv = isl_multi_val_zero(space);
-	v = isl_val_int_from_si(ctx, val);
-	for (i = 0; i < n; ++i)
-		mv = isl_multi_val_set_val(mv, i, isl_val_copy(v));
-	isl_val_free(v);
-
-	return mv;
-}
-
-/* Construct an isl_multi_val living in "space" with values specified
- * by "list".  "list" is assumed to have at least as many entries
- * as the set dimension of "space".
- */
-__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
-	__isl_take isl_space *space, int *list)
-{
-	int i, n;
-	isl_ctx *ctx;
-	isl_multi_val *mv;
-
-	if (!space)
-		return NULL;
-
-	ctx = isl_space_get_ctx(space);
-	n = isl_space_dim(space, isl_dim_set);
-	mv = isl_multi_val_zero(space);
-	for (i = 0; i < n; ++i) {
-		isl_val *v;
-
-		v = isl_val_int_from_si(ctx, list[i]);
-		mv = isl_multi_val_set_val(mv, i, v);
-	}
-
-	return mv;
-}
-
-/* Compute the size of a bounding box around the origin and "set",
- * where "set" is assumed to contain only non-negative elements.
- * In particular, compute the maximal value of "set" in each direction
- * and add one.
- */
-__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set)
-{
-	int i, n;
-	isl_multi_pw_aff *mpa;
-
-	n = isl_set_dim(set, isl_dim_set);
-	mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
-	for (i = 0; i < n; ++i) {
-		isl_space *space;
-		isl_aff *one;
-		isl_pw_aff *bound;
-
-		if (!isl_set_dim_has_upper_bound(set, isl_dim_set, i)) {
-			const char *name;
-			name = isl_set_get_tuple_name(set);
-			if (!name)
-				name = "";
-			fprintf(stderr, "unable to determine extent of '%s' "
-				"in dimension %d\n", name, i);
-			set = isl_set_free(set);
-		}
-		bound = isl_set_dim_max(isl_set_copy(set), i);
-
-		space = isl_pw_aff_get_domain_space(bound);
-		one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
-		one = isl_aff_add_constant_si(one, 1);
-		bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
-		mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
-	}
-	isl_set_free(set);
-
-	return mpa;
-}
--- a/polly/lib/External/ppcg/util.h
+++ b/polly/lib/External/ppcg/util.h
@ -1,22 +0,0 @@
-#ifndef UTIL_H
-#define UTIL_H
-
-#include <string.h>
-
-#include <isl/space.h>
-#include <isl/val.h>
-
-/* Compare the prefix of "s" to "prefix" up to the length of "prefix".
- */
-static inline int prefixcmp(const char *s, const char *prefix)
-{
-	return strncmp(s, prefix, strlen(prefix));
-}
-
-__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
-	int val);
-__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
-	__isl_take isl_space *space, int *list);
-__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set);
-
-#endif
--- a/polly/lib/External/ppcg/version.c
+++ b/polly/lib/External/ppcg/version.c
@ -1,6 +0,0 @@
-#include "gitversion.h"
-
-const char *ppcg_version(void)
-{
-	return GIT_HEAD_ID"\n";
-}
--- a/polly/lib/Support/RegisterPasses.cpp
+++ b/polly/lib/Support/RegisterPasses.cpp
@ -217,14 +217,6 @@ static StaticInitializer InitializeEverything;
 void initializePollyPasses(llvm::PassRegistry &Registry) {
  initializeCodeGenerationPass(Registry);

-#ifdef GPU_CODEGEN
-  initializePPCGCodeGenerationPass(Registry);
-  initializeManagedMemoryRewritePassPass(Registry);
-  LLVMInitializeNVPTXTarget();
-  LLVMInitializeNVPTXTargetInfo();
-  LLVMInitializeNVPTXTargetMC();
-  LLVMInitializeNVPTXAsmPrinter();
-#endif
  initializeCodePreparationPass(Registry);
  initializeDeadCodeElimWrapperPassPass(Registry);
  initializeDependenceInfoPass(Registry);
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@ -711,11 +711,6 @@ static void runIslScheduleOptimizer(
    function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
    TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
    isl::schedule &LastSchedule, bool &DepsChanged) {
-
-  // Skip SCoPs in case they're already optimised by PPCGCodeGeneration
-  if (S.isToBeSkipped())
-    return;
-
  // Skip empty SCoPs but still allow code generation as it will delete the
  // loops present but not needed.
  if (S.getSize() == 0) {
--- a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
+++ b/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
@ -1,9 +0,0 @@
-define float @__nv_expf(float %a) {
-  ret float %a
-}
-define float @__nv_cosf(float %a) {
-  ret float %a
-}
-define float @__nv_logf(float %a) {
-  ret float %a
-}
--- a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll
+++ b/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll
@ -1,71 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; Check that we detect a scop.
-; SCOP:       Function: checkScalarKill
-; SCOP-NEXT: Region: %XLoopInit---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-
-; Check that we have a scalar that is not a phi node in the scop.
-; SCOP: i32 MemRef_x_0; // Element size 4
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-; Check that we add variables that are local to a scop into the kills that we
-; pass to PPCG. This should enable PPCG to codegen this example.
-; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) {
-; int x;
-; #pragma scop
-;     for(int i = 0; i < 1000; i++) {
-; XLoopInit:        x = 0;
-;
-;         if (control1 > 2)
-;             C1Add: x += 10;
-;         if (control2 > 3)
-;             C2Add: x += A[i];
-;
-; BLoopAccumX:        B[i] += x;
-;     }
-;
-; #pragma endscop
-; }
-; ModuleID = 'test.ll'
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @checkScalarKill(ptr %A, ptr %B, ptr %C, i32 %control1, i32 %control2) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %XLoopInit
-
-XLoopInit:                                        ; preds = %entry.split, %BLoopAccumX
-  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ]
-  %cmp1 = icmp sgt i32 %control1, 2
-  %x.0 = select i1 %cmp1, i32 10, i32 0
-  %cmp2 = icmp sgt i32 %control2, 3
-  br i1 %cmp2, label %C2Add, label %BLoopAccumX
-
-C2Add:                                            ; preds = %XLoopInit
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
-  %tmp6 = load i32, ptr %arrayidx, align 4
-  %add4 = add nsw i32 %tmp6, %x.0
-  br label %BLoopAccumX
-
-BLoopAccumX:                                      ; preds = %XLoopInit, %C2Add
-  %x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ]
-  %arrayidx7 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
-  %tmp11 = load i32, ptr %arrayidx7, align 4
-  %add8 = add nsw i32 %tmp11, %x.1
-  store i32 %add8, ptr %arrayidx7, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %XLoopInit, label %for.end
-
-for.end:                                          ; preds = %BLoopAccumX
-  ret void
-}
--- a/polly/test/GPGPU/align-params-in-schedule.ll
+++ b/polly/test/GPGPU/align-params-in-schedule.ll
@ -1,53 +0,0 @@
-; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc
-
-; CHECK: polly_launchKernel
-
-; Verify that this program compiles. At some point, this compilation crashed
-; due to insufficient parameters being available.
-
-source_filename = "bugpoint-output-4d01492.bc"
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.barney = type { ptr, i64, i64, [2 x %struct.widget] }
-%struct.widget = type { i64, i64, i64 }
-
-@global = external unnamed_addr global %struct.barney, align 32
-
-; Function Attrs: nounwind uwtable
-define void @wobble(ptr noalias %arg) #0 {
-bb:
-  %tmp = load i32, ptr %arg, align 4
-  br label %bb1
-
-bb1:                                              ; preds = %bb13, %bb
-  %tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ]
-  br label %bb3
-
-bb3:                                              ; preds = %bb3, %bb1
-  %tmp4 = load ptr, ptr @global, align 32
-  %tmp5 = sext i32 %tmp2 to i64
-  %tmp6 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 3, i64 1, i32 0), align 8
-  %tmp7 = mul i64 %tmp6, %tmp5
-  %tmp8 = add i64 %tmp7, 0
-  %tmp9 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 1), align 8
-  %tmp10 = add i64 %tmp8, %tmp9
-  %tmp11 = getelementptr i32, ptr %tmp4, i64 %tmp10
-  store i32 undef, ptr %tmp11, align 4
-  %tmp12 = icmp eq i32 0, 0
-  br i1 %tmp12, label %bb13, label %bb3
-
-bb13:                                             ; preds = %bb3
-  %tmp14 = icmp eq i32 %tmp2, %tmp
-  %tmp15 = add i32 %tmp2, 1
-  br i1 %tmp14, label %bb16, label %bb1
-
-bb16:                                             ; preds = %bb13
-  ret void
-}
-
-attributes #0 = { nounwind uwtable }
--- a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll
+++ b/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll
@ -1,50 +0,0 @@
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-use-llvm-names < %s
-; ModuleID = 'test/GPGPU/zero-size-array.ll'
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-
-; We used to divide the element size by 8 to arrive at the 'actual' size
-; of an array element. This used to cause arrays that have an element size
-; of less than 8 to collapse to size 0. This test makes sure that it does
-; not happen anymore.
-
-; f(int *niters_ptr, int *arr[0]) {
-;     const int inters = *niters_ptr;
-;     for(int i = 0; i < niters; i++) {
-;       arr[0][i + 1] = 0
-;     }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @f(ptr noalias %niters.ptr, ptr noalias %arr) #0 {
-entry:
-  %niters = load i32, ptr %niters.ptr, align 4
-  br label %loop.body
-
-loop.body:                                             ; preds = %loop.body, %entry
-  %indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ]
-  %indvar.sext = sext i32 %indvar to i64
-  %arr.slot = getelementptr [0 x i32], ptr %arr, i64 0, i64 %indvar.sext
-  store i32 0, ptr %arr.slot, align 4
-  %tmp8 = icmp eq i32 %indvar, %niters
-  %indvar.next = add i32 %indvar, 1
-  br i1 %tmp8, label %loop.exit, label %loop.body
-
-loop.exit:                                    ; preds = %loop.body
-  %tmp10 = icmp sgt i32 undef, 0
-  br label %auxiliary.loop
-
-auxiliary.loop:                                            ; preds = %"101", %loop.exit
-  %tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ]
-  br i1 undef, label %auxiliary.loop, label %exit
-
-exit:                              ; preds = %auxiliary.loop
-  ret void
-}
-
-attributes #0 = { nounwind uwtable }
--- a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll
+++ b/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll
@ -1,55 +0,0 @@
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-ignore-parameter-bounds \
-; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=HOST-IR
-;
-; REQUIRES: pollyacc
-
-; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain
-; all the parameters present in the program.
-;
-; The construction of the `isl_multi_pw_aff` requires all the indivisual `pw_aff`
-; to have the same parameter dimensions. To achieve this, we used to realign
-; every `pw_aff` with `Scop::Context`. However, in conjunction with
-; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context`
-; does not contain all parameters.
-;
-; We check that Polly does the right thing in this case and sets up the parameter
-; dimensions correctly.
-
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll'
-
-; C pseudocode
-; ------------
-; void f(int *arr, long niters, long stride) {
-;     for(int i = 0; i < niters; i++) {
-;       arr[i * stride] = 1;
-;     }
-; }
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: nounwind uwtable
-define void @f(ptr %arr, i64 %niters, i64 %stride) unnamed_addr #1 {
-entry:
-  br label %loop
-
-loop:                                             ; preds = %loop, %entry
-  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]
-  %idx = mul nuw nsw i64 %indvar, %stride
-  %slot = getelementptr i32, ptr %arr, i64 %idx
-  store i32 1, ptr %slot, align 4
-  %indvar.next = add nuw nsw i64 %indvar, 1
-  %check = icmp sgt i64 %indvar.next, %niters
-  br i1 %check, label %exit, label %loop
-
-exit:                                             ; preds = %loop
-  ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind uwtable }
--- a/polly/test/GPGPU/cuda-annotations.ll
+++ b/polly/test/GPGPU/cuda-annotations.ll
@ -1,37 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; REQUIRES: pollyacc
-
-; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) #0 {
-
-; KERNEL: !nvvm.annotations = !{!0}
-
-; KERNEL: !0 = !{ptr @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; Test input: adds 100 to every element A[i] for 0 <= i < n (signed compare).
-;   %A - base pointer of an i64 array that is read and written in place
-;   %n - signed upper bound (exclusive) of the loop
-; The KERNEL check lines above expect this loop to be outlined into
-; @FUNC_foo_SCOP_0_KERNEL_0 with maxntid{x,y,z} nvvm annotations.
-define void @foo(ptr %A, i64 %n) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb6, %bb
-  ; Loop header: %i.0 counts up from 0; leave the loop once %i.0 >= %n.
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
-  %tmp = icmp slt i64 %i.0, %n
-  br i1 %tmp, label %bb2, label %bb8
-
-bb2:                                              ; preds = %bb1
-  ; Loop body: A[i] = A[i] + 100.
-  %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp4 = load i64, ptr %tmp3, align 8
-  %tmp5 = add nsw i64 %tmp4, 100
-  store i64 %tmp5, ptr %tmp3, align 8
-  br label %bb6
-
-bb6:                                              ; preds = %bb2
-  ; Latch: increment the induction variable.
-  %tmp7 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb8:                                              ; preds = %bb1
-  ret void
-}
--- a/polly/test/GPGPU/cuda-managed-memory-simple.ll
+++ b/polly/test/GPGPU/cuda-managed-memory-simple.ll
@ -1,118 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -S  -polly-process-unprofitable -polly-acc-mincompute=0 -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc
-
-;
-;    #include <cuda_runtime.h>
-;
-;    static const int N = 45;
-;
-;    void copy(int *R, int *A) {
-;      for (int i = 0; i < N; i++) {
-;        R[i] = A[i] * 10;
-;      }
-;    }
-;
-;    int main() {
-;      int *A, *R;
-;
-;      cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
-;      cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
-;
-;      for (int i = 0; i < N; i++) {
-;        A[i] = i;
-;        R[i] = 0;
-;      }
-;      copy(R, A);
-;
-;      return 0;
-;    }
-;
-
-; CHECK-NOT: polly_copyFromHostToDevice
-; CHECK-NOT: polly_copyFromDeviceToHost
-; CHECK-NOT: polly_freeDeviceMemory
-; CHECK-NOT: polly_allocateMemoryForDevice
-
-; CHECK:       %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA()
-; CHECK-NEXT:  %[[REGCA:[0-9]+]] = bitcast i32* %A to i8*
-; CHECK-NEXT:  %[[REGCR:[0-9]+]] = bitcast i32* %R to i8*
-; CHECK-NEXT:  %[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
-; CHECK-NEXT:  store i8* %[[REGCA]], i8** %polly_launch_0_param_0
-; CHECK-NEXT:  %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8*
-; CHECK-NEXT:  store i8* %[[REGCP0]], i8** %[[REGGEP0]]
-; CHECK-NEXT:  %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
-; CHECK-NEXT:  store i8* %[[REGCR]], i8** %polly_launch_0_param_1
-; CHECK-NEXT:  %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8*
-; CHECK-NEXT:  store i8* %[[REGCP1]], i8** %[[REGGEP1]]
-; CHECK-NEXT:  %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([863 x i8], [863 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
-; CHECK-NEXT:  call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
-; CHECK-NEXT:  call void @polly_freeKernel(i8* %[[REGKERNEL]])
-; CHECK-NEXT:  call void @polly_synchronizeDevice()
-; CHECK-NEXT:  call void @polly_freeContext(i8* %[[REGCTX]])
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; IR for `copy` from the C listing above: R[i] = A[i] * 10 for i = 0..44.
-;   %R - destination i32 array
-;   %A - source i32 array
-; Typed pointers (i32*) are used here; the RUN line passes -opaque-pointers=0.
-; The CHECK lines above expect this function to launch
-; @FUNC_copy_SCOP_0_KERNEL_0 without any explicit host<->device copy calls.
-define void @copy(i32* %R, i32* %A) {
-entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  ; Loop header: iterate while the induction variable differs from 45 (N).
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
-  %exitcond = icmp ne i64 %indvars.iv, 45
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  ; R[i] = A[i] * 10
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %tmp = load i32, i32* %arrayidx, align 4
-  %mul = mul nsw i32 %tmp, 10
-  %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
-  store i32 %mul, i32* %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  ret void
-}
-
-; IR for `main` from the C listing above: obtains two buffers through
-; @cudaMallocManaged, initialises A[i] = i and R[i] = 0 for i = 0..44, then
-; calls @copy(R, A). Always returns 0.
-; NOTE(review): the two @cudaMallocManaged calls reference attribute group #2,
-; but no `attributes #2` definition appears in this file - confirm whether
-; that is intentional.
-define i32 @main() {
-entry:
-  ; Stack slots holding the two managed pointers.
-  %A = alloca i32*, align 8
-  %R = alloca i32*, align 8
-  ; 180 bytes = sizeof(int) * N with N = 45; the i32 1 flag corresponds to
-  ; cudaMemAttachGlobal in the C listing.
-  %tmp = bitcast i32** %A to i8**
-  %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
-  %tmp1 = bitcast i32** %R to i8**
-  %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
-  %exitcond = icmp ne i64 %indvars.iv, 45
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  ; A[i] = (int)i; the pointer is reloaded from its stack slot each iteration.
-  %tmp2 = load i32*, i32** %A, align 8
-  %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
-  %tmp3 = trunc i64 %indvars.iv to i32
-  store i32 %tmp3, i32* %arrayidx, align 4
-  ; R[i] = 0
-  %tmp4 = load i32*, i32** %R, align 8
-  %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
-  store i32 0, i32* %arrayidx3, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  ; Pass the destination (R) first and the source (A) second, matching @copy.
-  %tmp5 = load i32*, i32** %R, align 8
-  %tmp6 = load i32*, i32** %A, align 8
-  call void @copy(i32* %tmp5, i32* %tmp6)
-  ret i32 0
-}
-
-; External declaration of the allocator called from @main; the arguments are
-; (i8** out-pointer, i64 size in bytes, i32 flags) as used at the call sites.
-; NOTE(review): attribute group #1 is referenced here but is not defined
-; anywhere in this file - confirm whether that is intentional.
-declare i32 @cudaMallocManaged(i8**, i64, i32) #1
--- a/polly/test/GPGPU/debug-metadata-leak.ll
+++ b/polly/test/GPGPU/debug-metadata-leak.ll
@ -1,104 +0,0 @@
-; RUN: opt %loadPolly %s -polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: | FileCheck --check-prefix=KERNEL-IR %s
-
-; REQUIRES: pollyacc
-
-; KERNEL-IR: define ptx_kernel void @FUNC_vec_add_1_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arr, i32 %N) #0 {
-
-; The instruction marked <<<LeakyInst>>> is copied into the GPUModule,
-; with changes only to the parameters to access data on the device instead of
-; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the
-; instruction is annotated with a DILocation, copying the instruction also copies
-; the metadata into the GPUModule. This prevents code generation of the
-; ptx_kernel: verification of the Module in GPUNodeBuilder::finalize fails
-; because the copied DICompileUnit is not listed in any llvm.dbg.cu, which was
-; neither copied nor created.
-;
-; https://reviews.llvm.org/D35630 removes this debug metadata before the
-; instruction is copied to the GPUModule.
-;
-; vec_add_1.c:
-;      void vec_add_1(int N, int arr[N]) {
-;        int i=0;
-;        for( i=0 ; i<N ; i++) arr[i] += 1;
-;      }
-;
-source_filename = "vec_add_1.c"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; IR for vec_add_1.c above: arr[i] += 1 for 0 <= i < N, with debug info
-; attached to every instruction.
-;   %N   - i32 element count, sign-extended to i64 for the loop bound
-;   %arr - i32 array updated in place
-; The KERNEL-IR check above expects the loop to be outlined into
-; @FUNC_vec_add_1_SCOP_0_KERNEL_0.
-define void @vec_add_1(i32 %N, ptr %arr) !dbg !7 {
-entry:
-  ; Debug values for the source variables N (!13), arr (!14) and i (!15).
-  call void @llvm.dbg.value(metadata i32 %N, i64 0, metadata !13, metadata !16), !dbg !17
-  call void @llvm.dbg.value(metadata ptr %arr, i64 0, metadata !14, metadata !16), !dbg !18
-  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
-  %tmp = sext i32 %N to i64, !dbg !20
-  br label %for.cond, !dbg !20
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
-  call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !15, metadata !16), !dbg !19
-  %cmp = icmp slt i64 %indvars.iv, %tmp, !dbg !22
-  br i1 %cmp, label %for.body, label %for.end, !dbg !24
-
-for.body:                                         ; preds = %for.cond
-  ; arr[i] = arr[i] + 1. The add carries !dbg !26; it is the instruction the
-  ; header comment identifies as dragging debug metadata into the GPU module.
-  %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv, !dbg !25
-  %tmp1 = load i32, ptr %arrayidx, align 4, !dbg !26, !tbaa !27
-  %add = add nsw i32 %tmp1, 1, !dbg !26    ;   <<<LeakyInst>>>
-  store i32 %add, ptr %arrayidx, align 4, !dbg !26, !tbaa !27
-  br label %for.inc, !dbg !25
-
-for.inc:                                          ; preds = %for.body
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31
-  call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19
-  br label %for.cond, !dbg !32, !llvm.loop !33
-
-for.end:                                          ; preds = %for.cond
-  ret void, !dbg !35
-}
-
-; Debug-info intrinsic declarations. Only @llvm.dbg.value is called by
-; @vec_add_1 above; it is declared in its four-operand form, whose second
-; operand is the i64 that every call site passes as 0.
-declare void @llvm.dbg.declare(metadata, metadata, metadata)
-
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
-
-
-; Named metadata: !llvm.dbg.cu lists the module's only compile unit (!0).
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-; Compile unit, source file, module flags and producer string.
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 5.0.0"}
-; Subprogram for vec_add_1, its type, and its local variables N, arr and i.
-!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
-!8 = !DISubroutineType(types: !9)
-!9 = !{null, !10, !11}
-!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
-!12 = !{!13, !14, !15}
-!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10)
-!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11)
-!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10)
-!16 = !DIExpression()
-; Source locations and lexical scopes. !26 is the location attached to the
-; load/add/store of the loop body, including the <<<LeakyInst>>> add.
-!17 = !DILocation(line: 1, column: 20, scope: !7)
-!18 = !DILocation(line: 1, column: 27, scope: !7)
-!19 = !DILocation(line: 2, column: 7, scope: !7)
-!20 = !DILocation(line: 3, column: 8, scope: !21)
-!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3)
-!22 = !DILocation(line: 3, column: 15, scope: !23)
-!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3)
-!24 = !DILocation(line: 3, column: 3, scope: !21)
-!25 = !DILocation(line: 3, column: 25, scope: !23)
-!26 = !DILocation(line: 3, column: 32, scope: !23)
-; TBAA nodes referenced by the load and store of arr[i].
-!27 = !{!28, !28, i64 0}
-!28 = !{!"int", !29, i64 0}
-!29 = !{!"omnipotent char", !30, i64 0}
-!30 = !{!"Simple C/C++ TBAA"}
-; Remaining locations, plus the !llvm.loop ID (!33) used on the loop latch.
-!31 = !DILocation(line: 3, column: 21, scope: !23)
-!32 = !DILocation(line: 3, column: 3, scope: !23)
-!33 = distinct !{!33, !24, !34}
-!34 = !DILocation(line: 3, column: 35, scope: !21)
-!35 = !DILocation(line: 4, column: 1, scope: !7)
--- a/polly/test/GPGPU/double-parallel-loop.ll
+++ b/polly/test/GPGPU/double-parallel-loop.ll
@ -1,254 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=SCHED %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s -check-prefix=KERNEL-IR
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s -check-prefix=KERNEL-ASM
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today due to extensive output differences from when the test was written.
-
-; CHECK: Stmt_bb5
-; CHECK-NEXT:       Domain :=
-; CHECK-NEXT:           { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 };
-; CHECK-NEXT:       Schedule :=
-; CHECK-NEXT:           { Stmt_bb5[i0, i1] -> [i0, i1] };
-; CHECK-NEXT:       ReadAccess :=       [Reduction Type: NONE] [Scalar: 0]
-; CHECK-NEXT:           { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
-; CHECK-NEXT:       MustWriteAccess :=  [Reduction Type: NONE] [Scalar: 0]
-; CHECK-NEXT:           { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
-
-; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }"
-; SCHED-NEXT: child:
-; SCHED-NEXT:   context: "{ [] }"
-; SCHED-NEXT:   child:
-; SCHED-NEXT:     extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }"
-; SCHED-NEXT:     child:
-; SCHED-NEXT:       sequence:
-; SCHED-NEXT:       - filter: "{ to_device_MemRef_A[] }"
-; SCHED-NEXT:         child:
-; SCHED-NEXT:           set:
-; SCHED-NEXT:           - filter: "{ to_device_MemRef_A[] }"
-; SCHED-NEXT:             child:
-; SCHED-NEXT:               guard: "{ [] }"
-; SCHED-NEXT:       - filter: "{ Stmt_bb5[i0, i1] }"
-; SCHED-NEXT:         child:
-; SCHED-NEXT:           guard: "{ [] }"
-; SCHED-NEXT:           child:
-; SCHED-NEXT:             mark: "kernel"
-; SCHED-NEXT:             child:
-; SCHED-NEXT:               context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
-; SCHED-NEXT:               child:
-; SCHED-NEXT:                 filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }"
-; SCHED-NEXT:                 child:
-; SCHED-NEXT:                   schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]"
-; SCHED-NEXT:                   permutable: 1
-; SCHED-NEXT:                   coincident: [ 1, 1 ]
-; SCHED-NEXT:                   child:
-; SCHED-NEXT:                     filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
-; SCHED-NEXT:                     child:
-; SCHED-NEXT:                       schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]"
-; SCHED-NEXT:                       permutable: 1
-; SCHED-NEXT:                       coincident: [ 1, 1 ]
-; SCHED-NEXT:       - filter: "{ from_device_MemRef_A[] }"
-; SCHED-NEXT:         child:
-; SCHED-NEXT:           set:
-; SCHED-NEXT:           - filter: "{ from_device_MemRef_A[] }"
-; SCHED-NEXT:             child:
-; SCHED-NEXT:               guard: "{ [] }"
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(16, 32);
-; CODE-NEXT:     dim3 k0_dimGrid(32, 32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
-; CODE-NEXT:   Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
-
-; IR: polly.split_new_and_old:
-; IR-NEXT:   %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
-; IR-NEXT:   %.obit = extractvalue { i64, i1 } %0, 1
-; IR-NEXT:   %polly.overflow.state = or i1 false, %.obit
-; IR-NEXT:   %.res = extractvalue { i64, i1 } %0, 0
-; IR-NEXT:   %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
-; IR-NEXT:   %.obit1 = extractvalue { i64, i1 } %1, 1
-; IR-NEXT:   %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
-; IR-NEXT:   %.res3 = extractvalue { i64, i1 } %1, 0
-; IR-NEXT:   %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
-; IR-NEXT:   %.obit4 = extractvalue { i64, i1 } %2, 1
-; IR-NEXT:   %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
-; IR-NEXT:   %.res6 = extractvalue { i64, i1 } %2, 0
-; IR-NEXT:   %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
-; IR-NEXT:   %.obit7 = extractvalue { i64, i1 } %3, 1
-; IR-NEXT:   %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
-; IR-NEXT:   %.res9 = extractvalue { i64, i1 } %3, 0
-; IR-NEXT:   %4 = icmp sge i64 %.res9, 2621440
-; IR-NEXT:   %5 = and i1 true, %4
-; IR-NEXT:   %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
-; IR-NEXT:   %polly.rtc.result = and i1 %5, %polly.rtc.overflown
-; IR-NEXT:   br i1 %polly.rtc.result, label %polly.start, label %bb2
-
-; IR: polly.start:
-; IR-NEXT: br label %polly.acc.initialize
-
-; IR: polly.acc.initialize:
-; IR-NEXT:    [[GPUContext:%.*]] = call ptr @polly_initContext()
-; IR-NEXT:    %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 4194304)
-; IR-NEXT:    call void @polly_copyFromHostToDevice(ptr %A, ptr %p_dev_array_MemRef_A, i64 4194304)
-; IR-NEXT:    [[DevPtr:%.*]]  = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A)
-; IR-NEXT:    store ptr [[DevPtr]], ptr %polly_launch_0_param_0
-; IR-NEXT:    store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params
-; IR-NEXT:    call ptr @polly_getKernel
-; IR-NEXT:    call void @polly_launchKernel(ptr %11, i32 32, i32 32, i32 32, i32 16, i32 1, ptr %polly_launch_0_params_i8ptr)
-; IR-NEXT:    call void @polly_freeKernel
-; IR-NEXT:    call void @polly_copyFromDeviceToHost(ptr %p_dev_array_MemRef_A, ptr %A, i64 4194304)
-; IR-NEXT:    call void @polly_freeDeviceMemory(ptr %p_dev_array_MemRef_A)
-; IR-NEXT:    call void @polly_freeContext(ptr [[GPUContext]])
-; IR-NEXT:    br label %polly.exiting
-
-; IR: polly.exiting:
-; IR-NEXT:    br label %polly.merge_new_and_old
-
-; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(ptr %MemRef_A) #0 {
-; KERNEL-IR-NEXT: entry:
-; KERNEL-IR-NEXT:   %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-; KERNEL-IR-NEXT:   %b0 = zext i32 %0 to i64
-; KERNEL-IR-NEXT:   %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-; KERNEL-IR-NEXT:   %b1 = zext i32 %1 to i64
-; KERNEL-IR-NEXT:   %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-; KERNEL-IR-NEXT:   %t0 = zext i32 %2 to i64
-; KERNEL-IR-NEXT:   %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-; KERNEL-IR-NEXT:   %t1 = zext i32 %3 to i64
-; KERNEL-IR-NEXT:   br label %polly.loop_preheader
-
-; KERNEL-IR-LABEL: polly.loop_exit:                                  ; preds = %polly.stmt.bb5
-; KERNEL-IR-NEXT:   ret void
-
-; KERNEL-IR-LABEL: polly.loop_header:                                ; preds = %polly.stmt.bb5, %polly.loop_preheader
-; KERNEL-IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
-; KERNEL-IR-NEXT:   %4 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %5 = add nsw i64 %4, %t0
-; KERNEL-IR-NEXT:   %6 = mul nsw i64 32, %b1
-; KERNEL-IR-NEXT:   %7 = add nsw i64 %6, %t1
-; KERNEL-IR-NEXT:   %8 = mul nsw i64 16, %polly.indvar
-; KERNEL-IR-NEXT:   %9 = add nsw i64 %7, %8
-; KERNEL-IR-NEXT:   br label %polly.stmt.bb5
-
-; KERNEL-IR-LABEL: polly.stmt.bb5:                                   ; preds = %polly.loop_header
-; KERNEL-IR-NEXT:   %10 = mul i64 %5, %9
-; KERNEL-IR-NEXT:   %p_tmp6 = sitofp i64 %10 to float
-; KERNEL-IR-NEXT:   %11 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %12 = add nsw i64 %11, %t0
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024
-; KERNEL-IR-NEXT:   %13 = mul nsw i64 32, %b1
-; KERNEL-IR-NEXT:   %14 = add nsw i64 %13, %t1
-; KERNEL-IR-NEXT:   %15 = mul nsw i64 16, %polly.indvar
-; KERNEL-IR-NEXT:   %16 = add nsw i64 %14, %15
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A
-; KERNEL-IR-NEXT:   %tmp8_p_scalar_ = load float, ptr %polly.access.MemRef_A, align 4
-; KERNEL-IR-NEXT:   %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
-; KERNEL-IR-NEXT:   %17 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %18 = add nsw i64 %17, %t0
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024
-; KERNEL-IR-NEXT:   %19 = mul nsw i64 32, %b1
-; KERNEL-IR-NEXT:   %20 = add nsw i64 %19, %t1
-; KERNEL-IR-NEXT:   %21 = mul nsw i64 16, %polly.indvar
-; KERNEL-IR-NEXT:   %22 = add nsw i64 %20, %21
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A4 = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A3
-; KERNEL-IR-NEXT:   store float %p_tmp9, ptr %polly.access.MemRef_A4, align 4
-; KERNEL-IR-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
-; KERNEL-IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar, 0
-; KERNEL-IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
-
-; KERNEL-IR-LABEL: polly.loop_preheader:                             ; preds = %entry
-; KERNEL-IR-NEXT:   br label %polly.loop_header
-
-; KERNEL-IR: attributes #0 = { "polly.skip.fn" }
-
-; KERNEL-ASM: .version 3.2
-; KERNEL-ASM-NEXT: .target sm_30
-; KERNEL-ASM-NEXT: .address_size 64
-
-; KERNEL-ASM:   // .globl     kernel_0
-
-; KERNEL-ASM: .visible .entry kernel_0(
-; KERNEL-ASM-NEXT:   .param .u64 kernel_0_param_0
-; KERNEL-ASM-NEXT: )
-
-;    void double_parallel_loop(float A[][1024]) {
-;      for (long i = 0; i < 1024; i++)
-;        for (long j = 0; j < 1024; j++)
-;          A[i][j] += i * j;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; IR for the C listing above: A[i][j] += (float)(i * j) for
-; 0 <= i < 1024 and 0 <= j < 1024.
-;   %A - pointer to rows of [1024 x float], updated in place
-; The statement lives in block %bb5, which is why the CHECK/SCHED/CODE lines
-; above refer to it as Stmt_bb5 and to the array as MemRef_A.
-define void @double_parallel_loop(ptr %A) {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb13, %bb
-  ; Outer loop header over i; runs while i != 1024.
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
-  %exitcond1 = icmp ne i64 %i.0, 1024
-  br i1 %exitcond1, label %bb3, label %bb15
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb10, %bb3
-  ; Inner loop header over j; runs while j != 1024.
-  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %j.0, 1024
-  br i1 %exitcond, label %bb5, label %bb12
-
-bb5:                                              ; preds = %bb4
-  ; Statement body: A[i][j] = A[i][j] + (float)(i * j).
-  %tmp = mul nuw nsw i64 %i.0, %j.0
-  %tmp6 = sitofp i64 %tmp to float
-  %tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0
-  %tmp8 = load float, ptr %tmp7, align 4
-  %tmp9 = fadd float %tmp8, %tmp6
-  store float %tmp9, ptr %tmp7, align 4
-  br label %bb10
-
-bb10:                                             ; preds = %bb5
-  ; Inner latch: j++.
-  %tmp11 = add nuw nsw i64 %j.0, 1
-  br label %bb4
-
-bb12:                                             ; preds = %bb4
-  br label %bb13
-
-bb13:                                             ; preds = %bb12
-  ; Outer latch: i++.
-  %tmp14 = add nuw nsw i64 %i.0, 1
-  br label %bb2
-
-bb15:                                             ; preds = %bb2
-  ret void
-}
--- a/polly/test/GPGPU/failing-invariant-load-handling.ll
+++ b/polly/test/GPGPU/failing-invariant-load-handling.ll
@ -1,57 +0,0 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
-
-; Aggregate types used by @test below. %S has two i32 fields followed by an
-; array of twelve %L records; @test loads field 0 directly through %cpi,
-; field 1 through %nt, and writes the first i32 of %L elements in field 2.
-%S = type { i32, i32, [12 x %L] }
-%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
-
-; Test input: when %b is set and the first i32 of *%cpi is positive, runs a
-; bottom-tested loop that stores 0 to the first field of element
-; (%phi * %l1) of the %L array in %S, and reloads the loop bound from %cpi on
-; every iteration.
-;   %cpi - pointer to a %S record
-;   %b   - guards the whole region; when false the function returns at once
-; The SCOPS lines embedded below pin the two invariant loads Polly detects
-; (MemRef_cpi[0, 0] and MemRef_cpi[0, 1]) and the resulting array shape.
-define void @test(ptr %cpi, i1 %b) {
-; SCOPS-LABEL: Region: %if.then14---%exit
-; SCOPS:         Invariant Accesses: {
-; SCOPS-NEXT:            ReadAccess :=       [Reduction Type: NONE] [Scalar: 0]
-; SCOPS-NEXT:                [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] };
-; SCOPS-NEXT:            Execution Context: [l2, l1] -> {  :  }
-; SCOPS-NEXT:            ReadAccess :=       [Reduction Type: NONE] [Scalar: 0]
-; SCOPS-NEXT:                [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] };
-; SCOPS-NEXT:            Execution Context: [l2, l1] -> {  : l2 > 0 }
-; SCOPS-NEXT:    }
-; SCOPS:         Arrays {
-; SCOPS-NEXT:        i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4
-; SCOPS-NEXT:    }
-
-; Check that we gracefully handle failing invariant loads.
-; This test case is taken from:
-; test/Isl/CodeGen/invariant-load-dimension.ll
-
-; FIXME: Figure out how to actually generate code for this loop.
-; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function
-
-entry:
-  ; Address of the second i32 field of %S; it is loaded later as %l1.
-  %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
-  br i1 %b, label %if.then14, label %exit
-
-if.then14:
-  ; Skip the loop unless the first field is strictly positive.
-  %l0 = load i32, ptr %cpi, align 8
-  %cmp12.i = icmp sgt i32 %l0, 0
-  br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
-
-for.body.lr.ph.i:
-  ; Loop preheader: %l1 is the stride multiplier used inside the loop.
-  %l1 = load i32, ptr %nt, align 4
-  br label %for.body.i
-
-for.body.i:
-  %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
-  %mul.i163 = mul nsw i32 %phi, %l1
-  %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
-  store i32 0, ptr %cv, align 8
-  %inc = add nuw nsw i32 %phi, 1
-  ; The bound is reloaded from the same address as %l0 on every iteration.
-  %l2 = load i32, ptr %cpi, align 8
-  %cmp.i164 = icmp slt i32 %inc, %l2
-  br i1 %cmp.i164, label %for.body.i, label %exit
-
-exit:
-  ret void
-}
--- a/polly/test/GPGPU/failing-invariant-load-hoisting.ll
+++ b/polly/test/GPGPU/failing-invariant-load-hoisting.ll
@ -1,41 +0,0 @@
-; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
-
-; Aggregate types used by @test below: %S is two i32 fields followed by an
-; array of twelve %L records.
-%S = type { i32, i32, [12 x %L] }
-%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
-
-; Test input: guarded by %b and by the first i32 of *%cpi being positive, a
-; bottom-tested loop stores 0 to the first field of element (%phi * %l1) of
-; the %L array in %S, and reloads the loop bound from %cpi on every iteration.
-;   %cpi - pointer to a %S record
-;   %b   - guards the whole region; when false the function returns at once
-; The CODEGEN lines below expect a polly.preload.begin block whose first
-; instruction is a branch on the constant `false`.
-define void @test(ptr %cpi, i1 %b) {
-; CODEGEN-LABEL: @test(
-; CODEGEN:    polly.preload.begin:
-; CODEGEN-NEXT:  br i1 false
-
-entry:
-  ; Address of the second i32 field of %S; it is loaded later as %l1.
-  %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
-  br i1 %b, label %if.then14, label %exit
-
-if.then14:
-  ; Skip the loop unless the first field is strictly positive.
-  %l0 = load i32, ptr %cpi, align 8
-  %cmp12.i = icmp sgt i32 %l0, 0
-  br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
-
-for.body.lr.ph.i:
-  ; Loop preheader: %l1 is the stride multiplier used inside the loop.
-  %l1 = load i32, ptr %nt, align 4
-  br label %for.body.i
-
-for.body.i:
-  %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
-  %mul.i163 = mul nsw i32 %phi, %l1
-  %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
-  store i32 0, ptr %cv, align 8
-  %inc = add nuw nsw i32 %phi, 1
-  ; The bound is reloaded from the same address as %l0 on every iteration.
-  %l2 = load i32, ptr %cpi, align 8
-  %cmp.i164 = icmp slt i32 %inc, %l2
-  br i1 %cmp.i164, label %for.body.i, label %exit
-
-exit:
-  ret void
-}
--- a/polly/test/GPGPU/host-control-flow.ll
+++ b/polly/test/GPGPU/host-control-flow.ll
@ -1,176 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
-; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
-; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
-; RUN: -S < %s | FileCheck %s -check-prefix=IR
-;    void foo(float A[2][100]) {
-;      for (long t = 0; t < 100; t++)
-;        for (long i = 1; i < 99; i++)
-;          A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1];
-;    }
-
-; REQUIRES: pollyacc
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   for (int c0 = 0; c0 <= 99; c0 += 1)
-; CODE-NEXT:     {
-; CODE-NEXT:       dim3 k0_dimBlock(32);
-; CODE-NEXT:       dim3 k0_dimGrid(4);
-; CODE-NEXT:       kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, c0);
-; CODE-NEXT:       cudaCheckKernel();
-; CODE-NEXT:     }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
-; CODE-NEXT: }
-
-; IR-LABEL: polly.loop_header:                                ; preds = %polly.loop_header, %polly.loop_preheader
-; IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
-; ...
-; IR:  store i64 %polly.indvar, i64* %polly_launch_0_param_1
-; IR-NEXT:  [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
-; IR-NEXT:  [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
-; IR-NEXT:  store i8* [[REGB]], i8** [[REGA]]
-; IR: call i8* @polly_getKernel
-; ...
-; IR: call void @polly_freeKernel
-; IR-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
-; IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99
-; IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
-
-; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0)
-; KERNEL-IR-LABEL: entry:
-; KERNEL-IR-NEXT:   %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-; KERNEL-IR-NEXT:   %b0 = zext i32 %0 to i64
-; KERNEL-IR-NEXT:   %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-; KERNEL-IR-NEXT:   %t0 = zext i32 %1 to i64
-; KERNEL-IR-NEXT:   br label %polly.cond
-
-; KERNEL-IR-LABEL: polly.cond:                                       ; preds = %entry
-; KERNEL-IR-NEXT:   %2 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %3 = add nsw i64 %2, %t0
-; KERNEL-IR-NEXT:   %4 = icmp sle i64 %3, 97
-; KERNEL-IR-NEXT:   br i1 %4, label %polly.then, label %polly.else
-
-; KERNEL-IR-LABEL: polly.merge:                                      ; preds = %polly.else, %polly.stmt.for.body3
-; KERNEL-IR-NEXT:   ret void
-
-; KERNEL-IR-LABEL: polly.then:                                       ; preds = %polly.cond
-; KERNEL-IR-NEXT:   %5 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %6 = add nsw i64 %5, %t0
-; KERNEL-IR-NEXT:   br label %polly.stmt.for.body3
-
-; KERNEL-IR-LABEL: polly.stmt.for.body3:                             ; preds = %polly.then
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %pexp.pdiv_r = urem i64 %c0, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100
-; KERNEL-IR-NEXT:   %7 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %8 = add nsw i64 %7, %t0
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
-; KERNEL-IR-NEXT:   %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %pexp.pdiv_r2 = urem i64 %c0, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100
-; KERNEL-IR-NEXT:   %9 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %10 = add nsw i64 %9, %t0
-; KERNEL-IR-NEXT:   %11 = add nsw i64 %10, 1
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4
-; KERNEL-IR-NEXT:   %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4
-; KERNEL-IR-NEXT:   %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %pexp.pdiv_r7 = urem i64 %c0, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100
-; KERNEL-IR-NEXT:   %12 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %13 = add nsw i64 %12, %t0
-; KERNEL-IR-NEXT:   %14 = add nsw i64 %13, 2
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9
-; KERNEL-IR-NEXT:   %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4
-; KERNEL-IR-NEXT:   %p_add12 = fadd float %p_add, %tmp3_p_scalar_
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %15 = add nsw i64 %c0, 1
-; KERNEL-IR-NEXT:   %pexp.pdiv_r12 = urem i64 %15, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100
-; KERNEL-IR-NEXT:   %16 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %17 = add nsw i64 %16, %t0
-; KERNEL-IR-NEXT:   %18 = add nsw i64 %17, 1
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14
-; KERNEL-IR-NEXT:   %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4
-; KERNEL-IR-NEXT:   %p_add17 = fadd float %tmp4_p_scalar_, %p_add12
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %19 = add nsw i64 %c0, 1
-; KERNEL-IR-NEXT:   %pexp.pdiv_r17 = urem i64 %19, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100
-; KERNEL-IR-NEXT:   %20 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %21 = add nsw i64 %20, %t0
-; KERNEL-IR-NEXT:   %22 = add nsw i64 %21, 1
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19
-; KERNEL-IR-NEXT:   store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4
-; KERNEL-IR-NEXT:   br label %polly.merge
-
-; KERNEL-IR-LABEL: polly.else:                                       ; preds = %polly.cond
-; KERNEL-IR-NEXT:   br label %polly.merge
-; KERNEL-IR-NEXT: }
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo([100 x float]* %A) {
-entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc18, %entry
-  %t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]
-  %exitcond1 = icmp ne i64 %t.0, 100
-  br i1 %exitcond1, label %for.body, label %for.end20
-
-for.body:                                         ; preds = %for.cond
-  br label %for.cond1
-
-for.cond1:                                        ; preds = %for.inc, %for.body
-  %i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ]
-  %exitcond = icmp ne i64 %i.0, 99
-  br i1 %exitcond, label %for.body3, label %for.end
-
-for.body3:                                        ; preds = %for.cond1
-  %sub = add nsw i64 %i.0, -1
-  %rem = srem i64 %t.0, 2
-  %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub
-  %tmp = load float, float* %arrayidx4, align 4
-  %rem5 = srem i64 %t.0, 2
-  %arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0
-  %tmp2 = load float, float* %arrayidx7, align 4
-  %add = fadd float %tmp, %tmp2
-  %add8 = add nuw nsw i64 %i.0, 1
-  %rem9 = srem i64 %t.0, 2
-  %arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8
-  %tmp3 = load float, float* %arrayidx11, align 4
-  %add12 = fadd float %add, %tmp3
-  %add13 = add nuw nsw i64 %t.0, 1
-  %rem14 = srem i64 %add13, 2
-  %arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0
-  %tmp4 = load float, float* %arrayidx16, align 4
-  %add17 = fadd float %tmp4, %add12
-  store float %add17, float* %arrayidx16, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body3
-  %inc = add nuw nsw i64 %i.0, 1
-  br label %for.cond1
-
-for.end:                                          ; preds = %for.cond1
-  br label %for.inc18
-
-for.inc18:                                        ; preds = %for.end
-  %inc19 = add nuw nsw i64 %t.0, 1
-  br label %for.cond
-
-for.end20:                                        ; preds = %for.cond
-  ret void
-}
--- a/polly/test/GPGPU/host-statement.ll
+++ b/polly/test/GPGPU/host-statement.ll
@ -1,204 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -polly-invariant-load-hoisting=false \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -polly-invariant-load-hoisting=false \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=KERNEL-IR %s
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-declare void @llvm.lifetime.start(i64, ptr nocapture) #0
-
-; This test case tests that we can correctly handle a ScopStmt that is
-; scheduled on the host, instead of within a kernel.
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(16);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   if (p_0 <= 510 && p_1 <= 510) {
-; CODE-NEXT:     {
-; CODE-NEXT:       dim3 k1_dimBlock(32);
-; CODE-NEXT:       dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
-; CODE-NEXT:       kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
-; CODE-NEXT:       cudaCheckKernel();
-; CODE-NEXT:     }
-
-; CODE:     {
-; CODE-NEXT:       dim3 k2_dimBlock(16, 32);
-; CODE-NEXT:       dim3 k2_dimGrid(16, p_1 <= -7650 ? 256 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
-; CODE-NEXT:       kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
-; CODE-NEXT:       cudaCheckKernel();
-; CODE-NEXT:     }
-
-; CODE:   }
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT:     Stmt_for_cond33_preheader_last();
-
-; CODE: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_for_body16(32 * b0 + t0);
-
-; CODE: # kernel1
-; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1)
-; CODE-NEXT:   for (int c1 = 0; c1 <= 15; c1 += 1) {
-; CODE-NEXT:     if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0)
-; CODE-NEXT:       Stmt_for_body35(32 * b0 + t0 + 1048576 * c0);
-; CODE-NEXT:     if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510)
-; CODE-NEXT:       for (int c3 = 0; c3 <= 31; c3 += 1)
-; CODE-NEXT:         Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3);
-; CODE-NEXT:     sync0();
-; CODE-NEXT:   }
-
-; CODE: # kernel2
-; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1)
-; CODE-NEXT:   if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510)
-; CODE-NEXT:     for (int c3 = 0; c3 <= 1; c3 += 1)
-; CODE-NEXT:       Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3);
-
-; KERNEL-IR: call void @llvm.nvvm.barrier0()
-
-; Function Attrs: nounwind uwtable
-define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, ptr %A, ptr %R, ptr %Q) #1 {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %entry.split, %for.inc86
-  %indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ]
-  %indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ]
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.cond1.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
-  %nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ]
-  %arrayidx5 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
-  %tmp = load double, ptr %arrayidx5, align 8, !tbaa !1
-  %arrayidx9 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
-  %tmp27 = load double, ptr %arrayidx9, align 8, !tbaa !1
-  %mul = fmul double %tmp, %tmp27
-  %add = fadd double %nrm.02, %mul
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 512
-  br i1 %exitcond, label %for.inc, label %for.end
-
-for.end:                                          ; preds = %for.inc
-  %add.lcssa = phi double [ %add, %for.inc ]
-  %call = tail call double @sqrt(double %add.lcssa) #2
-  %arrayidx13 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24
-  store double %call, ptr %arrayidx13, align 8, !tbaa !1
-  br label %for.body16
-
-for.cond33.preheader:                             ; preds = %for.body16
-  %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
-  %cmp347 = icmp slt i64 %indvars.iv.next25, 512
-  br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86
-
-for.body35.lr.ph:                                 ; preds = %for.cond33.preheader
-  br label %for.body35
-
-for.body16:                                       ; preds = %for.end, %for.body16
-  %indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ]
-  %arrayidx20 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv10, i64 %indvars.iv24
-  %tmp28 = load double, ptr %arrayidx20, align 8, !tbaa !1
-  %arrayidx24 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24
-  %tmp29 = load double, ptr %arrayidx24, align 8, !tbaa !1
-  %div = fdiv double %tmp28, %tmp29
-  %arrayidx28 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv10, i64 %indvars.iv24
-  store double %div, ptr %arrayidx28, align 8, !tbaa !1
-  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
-  %exitcond12 = icmp ne i64 %indvars.iv.next11, 512
-  br i1 %exitcond12, label %for.body16, label %for.cond33.preheader
-
-for.cond33.loopexit:                              ; preds = %for.body62
-  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next22 to i32
-  %exitcond23 = icmp ne i32 %lftr.wideiv, 512
-  br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge
-
-for.body35:                                       ; preds = %for.body35.lr.ph, %for.cond33.loopexit
-  %indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ]
-  %arrayidx39 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
-  store double 0.000000e+00, ptr %arrayidx39, align 8, !tbaa !1
-  br label %for.body42
-
-for.cond60.preheader:                             ; preds = %for.body42
-  br label %for.body62
-
-for.body42:                                       ; preds = %for.body35, %for.body42
-  %indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ]
-  %arrayidx46 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv13, i64 %indvars.iv24
-  %tmp30 = load double, ptr %arrayidx46, align 8, !tbaa !1
-  %arrayidx50 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv13, i64 %indvars.iv21
-  %tmp31 = load double, ptr %arrayidx50, align 8, !tbaa !1
-  %mul51 = fmul double %tmp30, %tmp31
-  %arrayidx55 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
-  %tmp32 = load double, ptr %arrayidx55, align 8, !tbaa !1
-  %add56 = fadd double %tmp32, %mul51
-  store double %add56, ptr %arrayidx55, align 8, !tbaa !1
-  %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
-  %exitcond15 = icmp ne i64 %indvars.iv.next14, 512
-  br i1 %exitcond15, label %for.body42, label %for.cond60.preheader
-
-for.body62:                                       ; preds = %for.cond60.preheader, %for.body62
-  %indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ]
-  %arrayidx66 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21
-  %tmp33 = load double, ptr %arrayidx66, align 8, !tbaa !1
-  %arrayidx70 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv16, i64 %indvars.iv24
-  %tmp34 = load double, ptr %arrayidx70, align 8, !tbaa !1
-  %arrayidx74 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
-  %tmp35 = load double, ptr %arrayidx74, align 8, !tbaa !1
-  %mul75 = fmul double %tmp34, %tmp35
-  %sub = fsub double %tmp33, %mul75
-  %arrayidx79 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21
-  store double %sub, ptr %arrayidx79, align 8, !tbaa !1
-  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
-  %exitcond18 = icmp ne i64 %indvars.iv.next17, 512
-  br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit
-
-for.cond33.for.inc86_crit_edge:                   ; preds = %for.cond33.loopexit
-  br label %for.inc86
-
-for.inc86:                                        ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader
-  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
-  %exitcond26 = icmp ne i64 %indvars.iv.next25, 512
-  br i1 %exitcond26, label %for.cond1.preheader, label %for.end88
-
-for.end88:                                        ; preds = %for.inc86
-  ret void
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, ptr nocapture) #0
-
-; Function Attrs: nounwind
-declare double @sqrt(double) #2
-
-attributes #0 = { argmemonly nounwind }
-attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"double", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
--- a/polly/test/GPGPU/ignore-parameter-bounds.ll
+++ b/polly/test/GPGPU/ignore-parameter-bounds.ll
@ -1,41 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; REQUIRES: pollyacc
-
-; CODE: Code
-; CODE: ====
-; CODE: No code generated
-
-source_filename = "bugpoint-output-83bcdeb.bc"
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-@__data_radiation_MOD_cobi = external global [168 x double], align 32
-
-; Function Attrs: nounwind uwtable
-define void @__radiation_rg_MOD_coe_so() #0 {
-entry:
-  %polly.access.kspec.load = load i32, ptr undef, align 4
-  %0 = or i1 undef, undef
-  br label %polly.preload.cond29
-
-polly.preload.cond29:                             ; preds = %entry
-  br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30
-
-polly.preload.merge30:                            ; preds = %polly.preload.exec31, %polly.preload.cond29
-  %polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ]
-  ret void
-
-polly.preload.exec31:                             ; preds = %polly.preload.cond29
-  %1 = sext i32 %polly.access.kspec.load to i64
-  %2 = mul nsw i64 7, %1
-  %3 = add nsw i64 0, %2
-  %4 = add nsw i64 %3, 48
-  %polly.access.__data_radiation_MOD_cobi = getelementptr double, ptr @__data_radiation_MOD_cobi, i64 %4
-  %polly.access.__data_radiation_MOD_cobi.load = load double, ptr %polly.access.__data_radiation_MOD_cobi, align 8
-  br label %polly.preload.merge30
-}
-
-attributes #0 = { nounwind uwtable }
--- a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll
+++ b/polly/test/GPGPU/intrinsic-copied-into-kernel.ll
@ -1,76 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR
-; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg  < %s | FileCheck %s --check-prefix=HOST-IR
-
-; Test that we do recognise and codegen a kernel that has intrinsics.
-
-; REQUIRES: pollyacc
-
-; Check that we model the kernel as a scop.
-; SCOP:      Function: f
-; SCOP-NEXT:       Region: %entry.split---%for.end
-
-; Check that the intrinsic call is present in the kernel IR.
-; KERNEL-IR:   %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
-; KERNEL-IR:   declare float @llvm.sqrt.f32(float)
-; KERNEL-IR:   declare float @llvm.fabs.f32(float)
-
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
-
-
-; void f(float *A, float *B, int N) {
-;   for(int i = 0; i < N; i++) {
-;       float tmp0 = A[i];
-;       float tmp1 = sqrt(tmp1);
-;       float tmp2 = fabs(tmp2);
-;       float tmp3 = copysignf(tmp1, tmp2);
-;       B[i] = tmp4;
-;   }
-; }
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @f(float* %A, float* %B, i32 %N) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %cmp1 = icmp sgt i32 %N, 0
-  br i1 %cmp1, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry.split
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  %A.arr.i.val = load float, float* %A.arr.i, align 4
-  ; Call to intrinsics that should be part of the kernel.
-  %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
-  %fabs = tail call float @llvm.fabs.f32(float %sqrt);
-  %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
-  %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %copysign, float* %B.arr.i, align 4
-
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %wide.trip.count = zext i32 %N to i64
-  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare float @llvm.sqrt.f32(float) #0
-declare float @llvm.fabs.f32(float) #0
-declare float @llvm.copysign.f32(float, float) #0
-
-attributes #0 = { nounwind readnone }
-
--- a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll
+++ b/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll
@ -1,47 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg  -polly-acc-fail-on-verify-module-failure \
-; RUN: -disable-output < %s
-
-; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually
-; fail on an illegal module.
-
-; REQUIRES: pollyacc, asserts
-; XFAIL: *
-;
-;    void foo(long A[1024], long B[1024]) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += (B[i] + (long)&B[i]);
-;    }
-
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb10, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb12
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp3 = load i64, ptr %tmp, align 8
-  %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp5 = ptrtoint ptr %tmp4 to i64
-  %tmp6 = add nsw i64 %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp8 = load i64, ptr %tmp7, align 8
-  %tmp9 = add nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, ptr %tmp7, align 8
-  br label %bb10
-
-bb10:                                             ; preds = %bb2
-  %tmp11 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb12:                                             ; preds = %bb1
-  ret void
-}
--- a/polly/test/GPGPU/invalid-kernel.ll
+++ b/polly/test/GPGPU/invalid-kernel.ll
@ -1,73 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: not FileCheck %s -check-prefix=KERNEL-IR
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; REQUIRES: pollyacc
-;
-;    void foo(long A[1024], long B[1024]) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += (B[i] + (long)&B[i]);
-;    }
-
-; This kernel loads/stores a pointer address we model. This is a rare case,
-; were we still lack proper code-generation support. We check here that we
-; detect the invalid IR and bail out gracefully.
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B, dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; KERNEL-IR: kernel
-
-; IR: br i1 false, label %polly.start, label %bb1
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb10, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb12
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp3 = load i64, ptr %tmp, align 8
-  %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp5 = ptrtoint ptr %tmp4 to i64
-  %tmp6 = add nsw i64 %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp8 = load i64, ptr %tmp7, align 8
-  %tmp9 = add nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, ptr %tmp7, align 8
-  br label %bb10
-
-bb10:                                             ; preds = %bb2
-  %tmp11 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb12:                                             ; preds = %bb1
-  ret void
-}
--- a/polly/test/GPGPU/invariant-load-array-access.ll
+++ b/polly/test/GPGPU/invariant-load-array-access.ll
@ -1,70 +0,0 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-
-
-; REQUIRES: pollyacc
-
-; Check that we detect a scop.
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %for.body---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-; SCOP-NEXT: Invariant Accesses: {
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] };
-; SCOP-NEXT:         Execution Context: [tmp] -> {  :  }
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] };
-; SCOP-NEXT:         Execution Context: [tmp] -> {  : tmp >= 4 }
-; SCOP-NEXT: }
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-; This test makes sure that such an access pattern is handled correctly
-; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads`
-; was the main reason that caused this test case to crash.
-;
-; void f(int *arr, const int *control, const int *readarr) {
-;     for(int i = 0; i < 1000; i++) {
-;         int t = 0;
-;         if (*control > 3) {
-;             t += *readarr;
-;         }
-;         arr[i] = t;
-;     }
-; }
-
-
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.12.0"
-define void @f(ptr %arr, ptr %control, ptr %readarr) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %entry.split, %if.end
-  %i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ]
-  %tmp = load i32, ptr %control, align 4
-  %cmp1 = icmp sgt i32 %tmp, 3
-  br i1 %cmp1, label %if.then, label %if.end
-
-if.then:                                          ; preds = %for.body
-  %tmp1 = load i32, ptr %readarr, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.01
-  store i32 %t.0, ptr %arrayidx, align 4
-  %inc = add nuw nsw i32 %i.01, 1
-  %exitcond = icmp eq i32 %inc, 1000
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %if.end
-  ret void
-}
--- a/Show More
+++ b/Show More