From 47b7c91abe7af3133a591aa2e73fffa30826f986 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Thu, 22 Feb 2024 15:29:29 -0600
Subject: [PATCH] [libc] Rework the GPU build to be a regular target (#81921)

Summary:
This is a massive patch because it reworks the entire build and
everything that depends on it. This is not split up because various bots
would fail otherwise. I will attempt to describe the necessary changes
here.

This patch completely reworks how the GPU build is built and targeted.
Previously, we used a standard runtimes build and handled both NVPTX and
AMDGPU in a single build via multi-targeting. This added a lot of
divergence in the build system and prevented us from doing various
things like building for the CPU / GPU at the same time, or exporting
the startup libraries or running tests without a full rebuild.

The new appraoch is to handle the GPU builds as strict cross-compiling
runtimes. The first step required
https://github.com/llvm/llvm-project/pull/81557 to allow the `LIBC`
target to build for the GPU without touching the other targets. This
means that the GPU uses all the same handling as the other builds in
`libc`.

The new expected way to build the GPU libc is with
`LLVM_LIBC_RUNTIME_TARGETS=amdgcn-amd-amdhsa;nvptx64-nvidia-cuda`.

The second step was reworking how we generated the embedded GPU library
by moving it into the library install step. Where we previously had one
`libcgpu.a` we now have `libcgpu-amdgpu.a` and `libcgpu-nvptx.a`. This
patch includes the necessary clang / OpenMP changes to make that not
break the bots when this lands.

We unfortunately still require that the NVPTX target has an `internal`
target for tests. This is because the NVPTX target needs to do LTO for
the provided version (The offloading toolchain can handle it) but cannot
use it for the native toolchain which is used for making tests.

This approach is vastly superior in every way, allowing us to treat the
GPU as a standard cross-compiling target. We can now install the GPU
utilities to do things like use the offload tests and other fun things.

Some certain utilities need to be built with
`--target=${LLVM_HOST_TRIPLE}` as well. I think this is a fine
workaround as we
will always assume that the GPU `libc` is a cross-build with a
functioning host.

Depends on https://github.com/llvm/llvm-project/pull/81557
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  37 +-
 clang/test/Driver/openmp-offload-gpu.c        |  20 +-
 libc/CMakeLists.txt                           |  20 +-
 .../cmake/modules/LLVMLibCArchitectures.cmake |  28 +-
 libc/cmake/modules/LLVMLibCCheckMPFR.cmake    |   2 +-
 .../modules/LLVMLibCCompileOptionRules.cmake  |  76 +---
 libc/cmake/modules/LLVMLibCHeaderRules.cmake  |   2 +-
 libc/cmake/modules/LLVMLibCLibraryRules.cmake | 141 +++++--
 libc/cmake/modules/LLVMLibCObjectRules.cmake  | 346 ++++--------------
 libc/cmake/modules/LLVMLibCTestRules.cmake    |  47 ++-
 .../modules/prepare_libc_gpu_build.cmake      | 118 ++----
 libc/docs/gpu/using.rst                       |  33 +-
 libc/include/CMakeLists.txt                   |   6 +-
 libc/lib/CMakeLists.txt                       |  35 +-
 libc/src/__support/File/CMakeLists.txt        |   2 +-
 libc/src/__support/GPU/CMakeLists.txt         |   2 +-
 libc/src/__support/OSUtil/CMakeLists.txt      |   2 +-
 libc/src/__support/RPC/CMakeLists.txt         |   2 +-
 libc/src/math/CMakeLists.txt                  |  16 +-
 libc/src/math/gpu/vendor/CMakeLists.txt       |   1 -
 libc/src/stdio/CMakeLists.txt                 |   2 +-
 libc/src/stdlib/CMakeLists.txt                |   4 +-
 libc/src/string/CMakeLists.txt                |  12 +-
 libc/startup/gpu/CMakeLists.txt               |  35 +-
 libc/startup/gpu/amdgpu/CMakeLists.txt        |  13 -
 libc/startup/gpu/nvptx/CMakeLists.txt         |   9 -
 libc/test/CMakeLists.txt                      |   6 +-
 libc/test/IntegrationTest/CMakeLists.txt      |  16 -
 libc/test/UnitTest/CMakeLists.txt             |   2 +-
 libc/test/src/__support/CMakeLists.txt        |  49 +--
 libc/test/src/__support/CPP/CMakeLists.txt    |   2 +-
 libc/test/src/__support/File/CMakeLists.txt   |   2 +-
 libc/test/src/errno/CMakeLists.txt            |   2 +-
 libc/test/src/math/CMakeLists.txt             |  20 +-
 libc/test/src/math/smoke/CMakeLists.txt       |   8 +-
 libc/test/src/stdio/CMakeLists.txt            |   2 +-
 libc/test/src/stdlib/CMakeLists.txt           |   6 +-
 libc/test/utils/UnitTest/CMakeLists.txt       |   2 +-
 libc/utils/CMakeLists.txt                     |   2 +-
 libc/utils/MPFRWrapper/CMakeLists.txt         |   2 +-
 libc/utils/gpu/CMakeLists.txt                 |   4 +-
 libc/utils/gpu/loader/CMakeLists.txt          |  48 ++-
 libc/utils/gpu/loader/amdgpu/CMakeLists.txt   |   6 +-
 libc/utils/gpu/loader/nvptx/CMakeLists.txt    |  10 +-
 libc/utils/gpu/server/CMakeLists.txt          |   9 +
 llvm/CMakeLists.txt                           |   4 +-
 llvm/cmake/modules/HandleLLVMOptions.cmake    |   7 +
 llvm/runtimes/CMakeLists.txt                  |  11 +-
 openmp/libomptarget/CMakeLists.txt            |   9 +-
 .../plugins-nextgen/common/CMakeLists.txt     |   6 +-
 .../plugins-nextgen/common/src/RPC.cpp        |   3 +-
 openmp/libomptarget/test/lit.cfg              |   8 +-
 52 files changed, 589 insertions(+), 668 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index e5196bd8b5ae..347b250260c4 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1087,10 +1087,41 @@ static void addOpenMPDeviceLibC(const ToolChain &TC, const ArgList &Args,
                           "llvm-libc-decls");
   bool HasLibC = llvm::sys::fs::exists(LibCDecls) &&
                  llvm::sys::fs::is_directory(LibCDecls);
-  if (Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, HasLibC)) {
-    CmdArgs.push_back("-lcgpu");
-    CmdArgs.push_back("-lmgpu");
+  if (!Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, HasLibC))
+    return;
+
+  // We don't have access to the offloading toolchains here, so determine from
+  // the arguments if we have any active NVPTX or AMDGPU toolchains.
+  llvm::DenseSet<const char *> Libraries;
+  if (const Arg *Targets = Args.getLastArg(options::OPT_fopenmp_targets_EQ)) {
+    if (llvm::any_of(Targets->getValues(),
+                     [](auto S) { return llvm::Triple(S).isAMDGPU(); })) {
+      Libraries.insert("-lcgpu-amdgpu");
+      Libraries.insert("-lmgpu-amdgpu");
+    }
+    if (llvm::any_of(Targets->getValues(),
+                     [](auto S) { return llvm::Triple(S).isNVPTX(); })) {
+      Libraries.insert("-lcgpu-nvptx");
+      Libraries.insert("-lmgpu-nvptx");
+    }
   }
+
+  for (StringRef Arch : Args.getAllArgValues(options::OPT_offload_arch_EQ)) {
+    if (llvm::any_of(llvm::split(Arch, ","), [](StringRef Str) {
+          return IsAMDGpuArch(StringToCudaArch(Str));
+        })) {
+      Libraries.insert("-lcgpu-amdgpu");
+      Libraries.insert("-lmgpu-amdgpu");
+    }
+    if (llvm::any_of(llvm::split(Arch, ","), [](StringRef Str) {
+          return IsNVIDIAGpuArch(StringToCudaArch(Str));
+        })) {
+      Libraries.insert("-lcgpu-nvptx");
+      Libraries.insert("-lmgpu-nvptx");
+    }
+  }
+
+  llvm::append_range(CmdArgs, Libraries);
 }
 
 void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC,
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index bccc5fd9483a..5da74a35d87a 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -393,14 +393,28 @@
 //
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
 // RUN:      --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
+// RUN:      --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgpu-gfx803.bc \
 // RUN:      --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN:      --offload-arch=sm_52 -gpulibc -nogpuinc %s 2>&1 \
+// RUN:      --rocm-path=%S/Inputs/rocm \
+// RUN:      --offload-arch=sm_52,gfx803 -gpulibc -nogpuinc %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=LIBC-GPU %s
-// LIBC-GPU: "-lcgpu"{{.*}}"-lmgpu"
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
+// RUN:      --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
+// RUN:      --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgpu-gfx803.bc \
+// RUN:      --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
+// RUN:      --rocm-path=%S/Inputs/rocm \
+// RUN:      -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 \
+// RUN:      -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 \
+// RUN:      -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -gpulibc -nogpuinc %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=LIBC-GPU %s
+// LIBC-GPU-DAG: "-lcgpu-amdgpu"
+// LIBC-GPU-DAG: "-lmgpu-amdgpu"
+// LIBC-GPU-DAG: "-lcgpu-nvptx"
+// LIBC-GPU-DAG: "-lmgpu-nvptx"
 
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \
 // RUN:      --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
 // RUN:      --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
 // RUN:      --offload-arch=sm_52 -nogpulibc -nogpuinc %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=NO-LIBC-GPU %s
-// NO-LIBC-GPU-NOT: "-lcgpu"{{.*}}"-lmgpu"
+// NO-LIBC-GPU-NOT: -lmgpu{{.*}}-lcgpu
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 9f9839423499..6a57fcec26e4 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -43,7 +43,7 @@ set(LIBC_NAMESPACE "__llvm_libc_${LLVM_VERSION_MAJOR}_${LLVM_VERSION_MINOR}_${LL
   CACHE STRING "The namespace to use to enclose internal implementations. Must start with '__llvm_libc'."
 )
 
-if(LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+if(LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD)
   if(NOT LIBC_HDRGEN_EXE)
     # We need to set up hdrgen first since other targets depend on it.
     add_subdirectory(utils/LibcTableGenUtil)
@@ -77,7 +77,7 @@ if(LIBC_HDRGEN_ONLY OR NEED_LIBC_HDRGEN)
   # to build libc-hdrgen and return.
 
   # Always make the RPC server availible to other projects for GPU mode.
-  if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+  if(LLVM_LIBC_GPU_BUILD)
     add_subdirectory(utils/gpu/server)
   endif()
   return()
@@ -118,7 +118,7 @@ if(COMMAND_RETURN_CODE EQUAL 0)
   message(STATUS "Set COMPILER_RESOURCE_DIR to "
                  "${COMPILER_RESOURCE_DIR} using --print-resource-dir")
 else()
-  if (LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  if (LIBC_TARGET_OS_IS_GPU)
     message(FATAL_ERROR "COMPILER_RESOURCE_DIR must be set for GPU builds")
   else()
     set(COMPILER_RESOURCE_DIR OFF)
@@ -216,11 +216,7 @@ foreach(config_path IN LISTS LIBC_CONFIG_JSON_FILE_LIST)
   load_libc_config(${config_path}/config.json ${cmd_line_conf})
 endforeach()
 
-if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
-  set(LIBC_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
-  set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm)
-  set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR})
-elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG)
+if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG)
   set(LIBC_INCLUDE_DIR ${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE})
   set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
   set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
@@ -235,7 +231,11 @@ else()
     set(LIBC_INCLUDE_DIR ${CMAKE_BINARY_DIR}/include)
     set(LIBC_LIBRARY_DIR ${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX})
   endif()
-  set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR})
+  if(LIBC_TARGET_OS_IS_GPU)
+    set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
+  else()
+    set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR})
+  endif()
 endif()
 
 if(LIBC_TARGET_TRIPLE)
@@ -247,7 +247,7 @@ else()
   set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX})
 endif()
 
-if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(LIBC_TARGET_OS_IS_GPU)
   include(prepare_libc_gpu_build)
   set(LIBC_ENABLE_UNITTESTS OFF)
 endif()
diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake b/libc/cmake/modules/LLVMLibCArchitectures.cmake
index 623ed774be72..0dbc59ad643a 100644
--- a/libc/cmake/modules/LLVMLibCArchitectures.cmake
+++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake
@@ -6,18 +6,6 @@
 # platform.
 # ------------------------------------------------------------------------------
 
-if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
-  # We set the generic target and OS to "gpu" here. More specific defintions
-  # for the exact target GPU are set up in prepare_libc_gpu_build.cmake.
-  set(LIBC_TARGET_OS "gpu")
-  set(LIBC_TARGET_ARCHITECTURE_IS_GPU TRUE)
-  set(LIBC_TARGET_ARCHITECTURE "gpu")
-  if(LIBC_TARGET_TRIPLE)
-    message(WARNING "LIBC_TARGET_TRIPLE is ignored as LIBC_GPU_BUILD is on. ")
-  endif()
-  return()
-endif()
-
 if(MSVC)
   # If the compiler is visual c++ or equivalent, we will assume a host build.
   set(LIBC_TARGET_OS ${CMAKE_HOST_SYSTEM_NAME})
@@ -59,6 +47,10 @@ function(get_arch_and_system_from_triple triple arch_var sys_var)
     set(target_arch "riscv32")
   elseif(target_arch MATCHES "^riscv64")
     set(target_arch "riscv64")
+  elseif(target_arch MATCHES "^amdgcn")
+    set(target_arch "amdgpu")
+  elseif(target_arch MATCHES "^nvptx64")
+    set(target_arch "nvptx")
   else()
     return()
   endif()
@@ -75,6 +67,12 @@ function(get_arch_and_system_from_triple triple arch_var sys_var)
     set(target_sys "darwin")
   endif()
 
+  # Setting OS name for GPU architectures.
+  list(GET triple_comps -1 gpu_target_sys)
+  if(gpu_target_sys MATCHES "^amdhsa" OR gpu_target_sys MATCHES "^cuda")
+    set(target_sys "gpu")
+  endif()
+
   set(${sys_var} ${target_sys} PARENT_SCOPE)
 endfunction(get_arch_and_system_from_triple)
 
@@ -156,6 +154,10 @@ elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv64")
 elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv32")
   set(LIBC_TARGET_ARCHITECTURE_IS_RISCV32 TRUE)
   set(LIBC_TARGET_ARCHITECTURE "riscv")
+elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "amdgpu")
+  set(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU TRUE)
+elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "nvptx")
+  set(LIBC_TARGET_ARCHITECTURE_IS_NVPTX TRUE)
 else()
   message(FATAL_ERROR
           "Unsupported libc target architecture ${LIBC_TARGET_ARCHITECTURE}")
@@ -178,6 +180,8 @@ elseif(LIBC_TARGET_OS STREQUAL "darwin")
   set(LIBC_TARGET_OS_IS_DARWIN TRUE)
 elseif(LIBC_TARGET_OS STREQUAL "windows")
   set(LIBC_TARGET_OS_IS_WINDOWS TRUE)
+elseif(LIBC_TARGET_OS STREQUAL "gpu")
+  set(LIBC_TARGET_OS_IS_GPU TRUE)
 else()
   message(FATAL_ERROR
           "Unsupported libc target operating system ${LIBC_TARGET_OS}")
diff --git a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake
index 9e361f5fd811..bbaeb9f0dc05 100644
--- a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake
+++ b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake
@@ -2,7 +2,7 @@ set(LLVM_LIBC_MPFR_INSTALL_PATH "" CACHE PATH "Path to where MPFR is installed (
 
 if(LLVM_LIBC_MPFR_INSTALL_PATH)
   set(LIBC_TESTS_CAN_USE_MPFR TRUE)
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(LIBC_TARGET_OS_IS_GPU)
   set(LIBC_TESTS_CAN_USE_MPFR FALSE)
 else()
   try_compile(
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 140e4d51a9c2..33ba5da4f8d5 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -82,10 +82,22 @@ function(_get_common_compile_options output_var flags)
     list(APPEND compile_options "/EHs-c-")
     list(APPEND compile_options "/GR-")
   endif()
-  if (LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  if (LIBC_TARGET_OS_IS_GPU)
     list(APPEND compile_options "-nogpulib")
     list(APPEND compile_options "-fvisibility=hidden")
     list(APPEND compile_options "-fconvergent-functions")
+    list(APPEND compile_options "-flto")
+
+    if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+      list(APPEND compile_options "-Wno-unknown-cuda-version")
+      list(APPEND compile_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false")
+      list(APPEND compile_options "--cuda-feature=+ptx63")
+      if(LIBC_CUDA_ROOT)
+        list(APPEND compile_options "--cuda-path=${LIBC_CUDA_ROOT}")
+      endif()
+    elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+      list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none")
+    endif()
 
     # Manually disable all standard include paths and include the resource
     # directory to prevent system headers from being included.
@@ -138,73 +150,21 @@ function(_get_common_test_compile_options output_var flags)
   set(${output_var} ${compile_options} PARENT_SCOPE)
 endfunction()
 
-# Obtains NVPTX specific arguments for compilation.
-# The PTX feature is primarily based on the CUDA toolchain version. We want to
-# be able to target NVPTX without an existing CUDA installation, so we need to
-# set this manually. This simply sets the PTX feature to the minimum required
-# for the features we wish to use on that target. The minimum PTX features used
-# here roughly corresponds to the CUDA 9.0 release.
-# Adjust as needed for desired PTX features.
-function(get_nvptx_compile_options output_var gpu_arch)
-  set(nvptx_options "")
-  list(APPEND nvptx_options "-march=${gpu_arch}")
-  list(APPEND nvptx_options "-Wno-unknown-cuda-version")
-  list(APPEND nvptx_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false")
-  if(${gpu_arch} STREQUAL "sm_35")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_37")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_50")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_52")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_53")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_60")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_61")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_62")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_70")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_72")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_75")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_80")
-    list(APPEND nvptx_options "--cuda-feature=+ptx72")
-  elseif(${gpu_arch} STREQUAL "sm_86")
-    list(APPEND nvptx_options "--cuda-feature=+ptx72")
-  elseif(${gpu_arch} STREQUAL "sm_89")
-    list(APPEND nvptx_options "--cuda-feature=+ptx72")
-  elseif(${gpu_arch} STREQUAL "sm_90")
-    list(APPEND nvptx_options "--cuda-feature=+ptx72")
-  else()
-    message(FATAL_ERROR "Unknown Nvidia GPU architecture '${gpu_arch}'")
-  endif()
-
-  if(LIBC_CUDA_ROOT)
-    list(APPEND nvptx_options "--cuda-path=${LIBC_CUDA_ROOT}")
-  endif()
-  set(${output_var} ${nvptx_options} PARENT_SCOPE)
-endfunction()
-
 function(_get_hermetic_test_compile_options output_var flags)
   _get_compile_options_from_flags(compile_flags ${flags})
   list(APPEND compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags}
        ${flags} -fpie -ffreestanding -fno-exceptions -fno-rtti)
 
   # The GPU build requires overriding the default CMake triple and architecture.
-  if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+  if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     list(APPEND compile_options
          -nogpulib -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
-         --target=${LIBC_GPU_TARGET_TRIPLE}
          -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION})
-  elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
-    get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE})
+  elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
     list(APPEND compile_options
-         -nogpulib ${nvptx_options} -fno-use-cxa-atexit --target=${LIBC_GPU_TARGET_TRIPLE})
+         "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false"
+         --cuda-path=${LIBC_CUDA_ROOT}
+         -nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit)
   endif()
   set(${output_var} ${compile_options} PARENT_SCOPE)
 endfunction()
diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index 9e9b598721ab..19515b1cbcc1 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -139,7 +139,7 @@ function(add_gen_header target_name)
             ${hdrgen_deps}
   )
 
-  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  if(LIBC_TARGET_OS_IS_GPU)
     file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls)
     set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path})
     add_custom_command(
diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
index 81c207ec2317..f15ffd5f9c21 100644
--- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake
+++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
@@ -50,31 +50,9 @@ function(collect_object_file_deps target result)
   endif()
 endfunction(collect_object_file_deps)
 
-# A rule to build a library from a collection of entrypoint objects.
-# Usage:
-#     add_entrypoint_library(
-#       DEPENDS <list of add_entrypoint_object targets>
-#     )
-#
-# NOTE: If one wants an entrypoint to be available in a library, then they will
-# have to list the entrypoint target explicitly in the DEPENDS list. Implicit
-# entrypoint dependencies will not be added to the library.
-function(add_entrypoint_library target_name)
-  cmake_parse_arguments(
-    "ENTRYPOINT_LIBRARY"
-    "" # No optional arguments
-    "" # No single value arguments
-    "DEPENDS" # Multi-value arguments
-    ${ARGN}
-  )
-  if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
-    message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
-                        "of 'add_entrypoint_object' targets.")
-  endif()
-
-  get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
+function(get_all_object_file_deps result fq_deps_list)
   set(all_deps "")
-  foreach(dep IN LISTS fq_deps_list)
+  foreach(dep ${fq_deps_list})
     get_target_property(dep_type ${dep} "TARGET_TYPE")
     if(NOT ((${dep_type} STREQUAL ${ENTRYPOINT_OBJ_TARGET_TYPE}) OR
             (${dep_type} STREQUAL ${ENTRYPOINT_EXT_TARGET_TYPE}) OR
@@ -102,6 +80,121 @@ function(add_entrypoint_library target_name)
     list(APPEND all_deps ${entrypoint_target})
   endforeach(dep)
   list(REMOVE_DUPLICATES all_deps)
+  set(${result} ${all_deps} PARENT_SCOPE)
+endfunction()
+
+# A rule to build a library from a collection of entrypoint objects and bundle
+# it into a GPU fatbinary. Usage is the same as 'add_entrypoint_library'.
+# Usage:
+#     add_gpu_entrypoint_library(
+#       DEPENDS <list of add_entrypoint_object targets>
+#     )
+function(add_gpu_entrypoint_library target_name)
+  cmake_parse_arguments(
+    "ENTRYPOINT_LIBRARY"
+    "" # No optional arguments
+    "" # No single value arguments
+    "DEPENDS" # Multi-value arguments
+    ${ARGN}
+  )
+  if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
+    message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
+                        "of 'add_entrypoint_object' targets.")
+  endif()
+
+  get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
+  get_all_object_file_deps(all_deps "${fq_deps_list}")
+
+  # The GPU 'libc' needs to be exported in a format that can be linked with
+  # offloading langauges like OpenMP or CUDA. This wraps every GPU object into a
+  # fat binary and adds them to a static library.
+  set(objects "")
+  foreach(dep IN LISTS all_deps)
+    set(object $<$<STREQUAL:$<TARGET_NAME_IF_EXISTS:${dep}>,${dep}>:$<TARGET_OBJECTS:${dep}>>)
+    string(FIND ${dep} "." last_dot_loc REVERSE)
+    math(EXPR name_loc "${last_dot_loc} + 1")
+    string(SUBSTRING ${dep} ${name_loc} -1 name)
+    if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+      set(prefix --image=arch=generic,triple=nvptx64-nvidia-cuda,feature=+ptx63)
+    elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+      set(prefix --image=arch=generic,triple=amdgcn-amd-amdhsa)
+    endif()
+
+    # Use the 'clang-offload-packager' to merge these files into a binary blob.
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin"
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/binary
+      COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
+              "${prefix},file=$<JOIN:${object},,file=>" -o 
+              ${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin
+      DEPENDS ${dep}
+      COMMENT "Packaging LLVM offloading binary for '${object}'"
+    )
+    add_custom_target(${dep}.__gpubin__ DEPENDS ${dep}
+                      "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
+
+    # CMake does not permit setting the name on object files. In order to have
+    # human readable names we create an empty stub file with the entrypoint
+    # name. This empty file will then have the created binary blob embedded.
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp
+      DEPENDS ${dep} ${dep}.__gpubin__
+    )
+    add_custom_target(${dep}.__stub__ 
+                      DEPENDS ${dep}.__gpubin__ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp")
+
+    add_library(${dep}.__fatbin__
+      EXCLUDE_FROM_ALL OBJECT
+      "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp"
+    )
+
+    # This is always compiled for the LLVM host triple instead of the native GPU
+    # triple that is used by default in the build. 
+    target_compile_options(${dep}.__fatbin__ BEFORE PRIVATE -nostdlib)
+    target_compile_options(${dep}.__fatbin__ PRIVATE 
+      --target=${LLVM_HOST_TRIPLE}
+      "SHELL:-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin")
+    add_dependencies(${dep}.__fatbin__ ${dep} ${dep}.__stub__ ${dep}.__gpubin__)
+
+    # Set the list of newly create fat binaries containing embedded device code.
+    list(APPEND objects $<TARGET_OBJECTS:${dep}.__fatbin__>)
+  endforeach()
+
+  add_library(
+    ${target_name}
+    STATIC
+      ${objects}
+  )
+  set_target_properties(${target_name} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR})
+endfunction(add_gpu_entrypoint_library)
+
+# A rule to build a library from a collection of entrypoint objects.
+# Usage:
+#     add_entrypoint_library(
+#       DEPENDS <list of add_entrypoint_object targets>
+#     )
+#
+# NOTE: If one wants an entrypoint to be available in a library, then they will
+# have to list the entrypoint target explicitly in the DEPENDS list. Implicit
+# entrypoint dependencies will not be added to the library.
+function(add_entrypoint_library target_name)
+  cmake_parse_arguments(
+    "ENTRYPOINT_LIBRARY"
+    "" # No optional arguments
+    "" # No single value arguments
+    "DEPENDS" # Multi-value arguments
+    ${ARGN}
+  )
+  if(NOT ENTRYPOINT_LIBRARY_DEPENDS)
+    message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list "
+                        "of 'add_entrypoint_object' targets.")
+  endif()
+
+  get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS})
+  get_all_object_file_deps(all_deps "${fq_deps_list}")
+
   set(objects "")
   foreach(dep IN LISTS all_deps)
     list(APPEND objects $<$<STREQUAL:$<TARGET_NAME_IF_EXISTS:${dep}>,${dep}>:$<TARGET_OBJECTS:${dep}>>)
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 308ba7d0d5dd..78536f4eec55 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -1,175 +1,5 @@
 set(OBJECT_LIBRARY_TARGET_TYPE "OBJECT_LIBRARY")
 
-# Build the object target for a single GPU arch.
-# Usage:
-#     _build_gpu_object_for_single_arch(
-#       <target_name>
-#       <gpu_arch>
-#       SRCS <list of .cpp files>
-#       HDRS <list of .h files>
-#       DEPENDS <list of dependencies>
-#       COMPILE_OPTIONS <optional list of special compile options for this target>
-#       FLAGS <optional list of flags>
-#     )
-function(_build_gpu_object_for_single_arch fq_target_name gpu_arch)
-  cmake_parse_arguments(
-    "ADD_GPU_OBJ"
-    "" # No optional arguments
-    "NAME;CXX_STANDARD" # Single value arguments
-    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS"  # Multi value arguments
-    ${ARGN}
-  )
-
-  if(NOT ADD_GPU_OBJ_CXX_STANDARD)
-    set(ADD_GPU_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD})
-  endif()
-
-  set(compile_options ${ADD_GPU_OBJ_COMPILE_OPTIONS})
-  # Derive the triple from the specified architecture.
-  if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    set(gpu_target_triple ${AMDGPU_TARGET_TRIPLE})
-    list(APPEND compile_options "-mcpu=${gpu_arch}")
-    list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none")
-    list(APPEND compile_options "-emit-llvm")
-  elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
-    set(gpu_target_triple ${NVPTX_TARGET_TRIPLE})
-    get_nvptx_compile_options(nvptx_options ${gpu_arch})
-    list(APPEND compile_options "${nvptx_options}")
-  else()
-    message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
-  endif()
-  list(APPEND compile_options "--target=${gpu_target_triple}")
-
-  # Build the library for this target architecture. We always emit LLVM-IR for
-  # packaged GPU binaries.
-  add_library(${fq_target_name}
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${ADD_GPU_OBJ_SRCS}
-    ${ADD_GPU_OBJ_HDRS}
-  )
-
-  target_compile_options(${fq_target_name} PRIVATE ${compile_options})
-  target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
-  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  set_target_properties(${fq_target_name} PROPERTIES CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD})
-  if(ADD_GPU_OBJ_DEPENDS)
-    add_dependencies(${fq_target_name} ${ADD_GPU_OBJ_DEPENDS})
-    set_target_properties(${fq_target_name} PROPERTIES DEPS "${ADD_GPU_OBJ_DEPENDS}")
-  endif()
-endfunction(_build_gpu_object_for_single_arch)
-
-# Build the object target for the GPU.
-# This compiles the target for all supported architectures and embeds it into
-# host binary for installing.
-# Usage:
-#     _build_gpu_object_bundle(
-#       <target_name>
-#       SRCS <list of .cpp files>
-#       HDRS <list of .h files>
-#       DEPENDS <list of dependencies>
-#       COMPILE_OPTIONS <optional list of special compile options for this target>
-#       FLAGS <optional list of flags>
-#     )
-function(_build_gpu_object_bundle fq_target_name)
-  cmake_parse_arguments(
-    "ADD_GPU_OBJ"
-    "" # No optional arguments
-    "NAME;CXX_STANDARD" # Single value arguments
-    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS"  # Multi value arguments
-    ${ARGN}
-  )
-
-  if(NOT ADD_GPU_OBJ_CXX_STANDARD)
-    set(ADD_GPU_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD})
-  endif()
-
-  foreach(add_gpu_obj_src ${ADD_GPU_OBJ_SRCS})
-    # The packaged version will be built for every target GPU architecture. We do
-    # this so we can support multiple accelerators on the same machine.
-    foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
-      get_filename_component(src_name ${add_gpu_obj_src} NAME)
-      set(gpu_target_name ${fq_target_name}.${src_name}.${gpu_arch})
-
-      _build_gpu_object_for_single_arch(
-        ${gpu_target_name}
-        ${gpu_arch}
-        CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD}
-        HDRS ${ADD_GPU_OBJ_HDRS}
-        SRCS ${add_gpu_obj_src}
-        COMPILE_OPTIONS
-          ${ADD_GPU_OBJ_COMPILE_OPTIONS}
-          "-emit-llvm"
-        DEPENDS ${ADD_GPU_OBJ_DEPENDS}
-      )
-      # Append this target to a list of images to package into a single binary.
-      set(input_file $<TARGET_OBJECTS:${gpu_target_name}>)
-      if("${gpu_arch}" IN_LIST all_nvptx_architectures)
-        get_nvptx_compile_options(nvptx_options ${gpu_arch})
-        string(REGEX MATCH "\\+ptx[0-9]+" nvptx_ptx_feature ${nvptx_options})
-        list(APPEND packager_images
-             --image=file=${input_file},arch=${gpu_arch},triple=${NVPTX_TARGET_TRIPLE},feature=${nvptx_ptx_feature})
-      else()
-        list(APPEND packager_images
-             --image=file=${input_file},arch=${gpu_arch},triple=${AMDGPU_TARGET_TRIPLE})
-       endif()
-      list(APPEND gpu_target_objects ${input_file})
-    endforeach()
-
-    # After building the target for the desired GPUs we must package the output
-    # into a fatbinary, see https://clang.llvm.org/docs/OffloadingDesign.html for
-    # more information.
-    set(packaged_target_name ${fq_target_name}.${src_name}.__gpu__)
-    set(packaged_output_name ${CMAKE_CURRENT_BINARY_DIR}/${fq_target_name}.${src_name}.gpubin)
-
-    add_custom_command(OUTPUT ${packaged_output_name}
-                       COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
-                               ${packager_images} -o ${packaged_output_name}
-                       DEPENDS ${gpu_target_objects} ${add_gpu_obj_src} ${ADD_GPU_OBJ_HDRS}
-                       COMMENT "Packaging LLVM offloading binary")
-    add_custom_target(${packaged_target_name} DEPENDS ${packaged_output_name})
-    list(APPEND packaged_gpu_names ${packaged_target_name})
-    list(APPEND packaged_gpu_binaries ${packaged_output_name})
-  endforeach()
-
-  # We create an empty 'stub' file for the host to contain the embedded device
-  # code. This will be packaged into 'libcgpu.a'.
-  # TODO: In the future we will want to combine every architecture for a target
-  #       into a single bitcode file and use that. For now we simply build for
-  #       every single one and let the offloading linker handle it.
-  string(FIND ${fq_target_name} "." last_dot_loc REVERSE)
-  math(EXPR name_loc "${last_dot_loc} + 1")
-  string(SUBSTRING ${fq_target_name} ${name_loc} -1 target_name)
-  set(stub_filename "${target_name}.cpp")
-  add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}"
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs/
-    COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}
-    DEPENDS ${gpu_target_objects} ${ADD_GPU_OBJ_SRCS} ${ADD_GPU_OBJ_HDRS}
-  )
-  set(stub_target_name ${fq_target_name}.__stub__)
-  add_custom_target(${stub_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename})
-
-  add_library(
-    ${fq_target_name}
-    # We want an object library as the objects will eventually get packaged into
-    # an archive (like libcgpu.a).
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}
-  )
-  target_compile_options(${fq_target_name} BEFORE PRIVATE
-                         ${ADD_GPU_OBJ_COMPILE_OPTIONS} -nostdlib)
-  foreach(packaged_gpu_binary ${packaged_gpu_binaries})
-    target_compile_options(${fq_target_name} PRIVATE
-                           "SHELL:-Xclang -fembed-offload-object=${packaged_gpu_binary}")
-  endforeach()
-  target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
-  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-  add_dependencies(${fq_target_name}
-                   ${full_deps_list} ${packaged_gpu_names} ${stub_target_name})
-endfunction()
-
 # Rule which is essentially a wrapper over add_library to compile a set of
 # sources to object files.
 # Usage:
@@ -214,53 +44,37 @@ function(create_object_library fq_target_name)
     message(FATAL_ERROR "'add_object_library' rule requires SRCS to be specified.")
   endif()
 
-  # The GPU build uses a separate internal file.
-  if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND NOT ${ADD_OBJECT_NO_GPU_BUNDLE})
-    set(internal_target_name ${fq_target_name}.__internal__)
-    set(public_packaging_for_internal "")
-  else()
-    set(internal_target_name ${fq_target_name})
-    set(public_packaging_for_internal "-DLIBC_COPT_PUBLIC_PACKAGING")
-  endif()
+  set(internal_target_name ${fq_target_name}.__internal__)
+  set(public_packaging_for_internal "-DLIBC_COPT_PUBLIC_PACKAGING")
 
   _get_common_compile_options(compile_options "${ADD_OBJECT_FLAGS}")
   list(APPEND compile_options ${ADD_OBJECT_COMPILE_OPTIONS})
 
-  # GPU builds require special handling for the objects because we want to
-  # export several different targets at once, e.g. for both Nvidia and AMD.
-  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
-    if(NOT ${ADD_OBJECT_NO_GPU_BUNDLE})
-      _build_gpu_object_bundle(
-        ${fq_target_name}
-        SRCS ${ADD_OBJECT_SRCS}
-        HDRS ${ADD_OBJECT_HDRS}
-        CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD}
-        COMPILE_OPTIONS ${compile_options} "-DLIBC_COPT_PUBLIC_PACKAGING"
-        DEPENDS ${fq_deps_list}
-      )
-    endif()
-    # When the target for GPU is not bundled, internal_target_name is the same
-    # as fq_targetname
-    _build_gpu_object_for_single_arch(
-      ${internal_target_name}
-      ${LIBC_GPU_TARGET_ARCHITECTURE}
-      SRCS ${ADD_OBJECT_SRCS}
-      HDRS ${ADD_OBJECT_HDRS}
-      CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD}
-      COMPILE_OPTIONS ${compile_options} ${public_packaging_for_internal}
-      DEPENDS ${fq_deps_list}
-    )
-  else()
+  add_library(
+    ${fq_target_name}
+    EXCLUDE_FROM_ALL
+    OBJECT
+    ${ADD_OBJECT_SRCS}
+    ${ADD_OBJECT_HDRS}
+  )
+  target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  target_compile_options(${fq_target_name} PRIVATE ${compile_options})
+
+  # The NVPTX target is installed as LLVM-IR but the internal testing toolchain
+  # cannot handle it natively. Make a separate internal target for testing.
+  if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX AND NOT LIBC_GPU_TESTS_DISABLED)
     add_library(
-      ${fq_target_name}
+      ${internal_target_name}
       EXCLUDE_FROM_ALL
       OBJECT
       ${ADD_OBJECT_SRCS}
       ${ADD_OBJECT_HDRS}
     )
-    target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
-    target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-    target_compile_options(${fq_target_name} PRIVATE ${compile_options})
+    target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+    target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+    target_compile_options(${internal_target_name} PRIVATE ${compile_options}
+                           -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
   endif()
 
   if(SHOW_INTERMEDIATE_OBJECTS)
@@ -290,13 +104,18 @@ function(create_object_library fq_target_name)
       FLAGS "${ADD_OBJECT_FLAGS}"
   )
 
+  # If we built a separate internal target we want to use those target objects
+  # for testing instead of the exported target.
+  set(target_objects ${fq_target_name})
   if(TARGET ${internal_target_name})
-    set_target_properties(
-      ${fq_target_name}
-      PROPERTIES
-        OBJECT_FILES "$<TARGET_OBJECTS:${internal_target_name}>"
-    )
+    set(target_objects ${internal_target_name})
   endif()
+
+  set_target_properties(
+    ${fq_target_name}
+    PROPERTIES
+      OBJECT_FILES "$<TARGET_OBJECTS:${target_objects}>"
+  )
 endfunction(create_object_library)
 
 function(add_object_library target_name)
@@ -389,12 +208,19 @@ function(create_entrypoint_object fq_target_name)
 
     get_target_property(object_file ${fq_dep_name} "OBJECT_FILE")
     get_target_property(object_file_raw ${fq_dep_name} "OBJECT_FILE_RAW")
-    add_library(
-      ${internal_target_name}
-      EXCLUDE_FROM_ALL
-      OBJECT
-      ${object_file_raw}
-    )
+
+    # If the system cannot build the GPU tests we simply make a dummy target.
+    if(LIBC_TARGET_OS_IS_GPU AND LIBC_GPU_TESTS_DISABLED)
+      add_custom_target(${internal_target_name})
+    else()
+      add_library(
+        ${internal_target_name}
+        EXCLUDE_FROM_ALL
+        OBJECT
+        ${object_file_raw}
+      )
+    endif()
+
     add_dependencies(${internal_target_name} ${fq_dep_name})
     add_library(
       ${fq_target_name}
@@ -441,60 +267,42 @@ function(create_entrypoint_object fq_target_name)
     endif()
   endif()
 
-  # GPU builds require special handling for the objects because we want to
-  # export several different targets at once, e.g. for both Nvidia and AMD.
-  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
-    _build_gpu_object_bundle(
-      ${fq_target_name}
-      SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
-      HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
-      COMPILE_OPTIONS ${common_compile_options} "-DLIBC_COPT_PUBLIC_PACKAGING"
-      CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
-      DEPENDS ${full_deps_list}
-      FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-    )
-    _build_gpu_object_for_single_arch(
-      ${internal_target_name}
-      ${LIBC_GPU_TARGET_ARCHITECTURE}
-      SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
-      HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
-      COMPILE_OPTIONS ${common_compile_options}
-      CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
-      DEPENDS ${full_deps_list}
-      FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-    )
-  else()
-    add_library(
-      ${internal_target_name}
-      # TODO: We don't need an object library for internal consumption.
-      # A future change should switch this to a normal static library.
-      EXCLUDE_FROM_ALL
-      OBJECT
-      ${ADD_ENTRYPOINT_OBJ_SRCS}
-      ${ADD_ENTRYPOINT_OBJ_HDRS}
-    )
-    target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
-    target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
-    target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-    add_dependencies(${internal_target_name} ${full_deps_list})
-    target_link_libraries(${internal_target_name} ${full_deps_list})
+  add_library(
+    ${internal_target_name}
+    # TODO: We don't need an object library for internal consumption.
+    # A future change should switch this to a normal static library.
+    EXCLUDE_FROM_ALL
+    OBJECT
+    ${ADD_ENTRYPOINT_OBJ_SRCS}
+    ${ADD_ENTRYPOINT_OBJ_HDRS}
+  )
+  target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
+  target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+  target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  add_dependencies(${internal_target_name} ${full_deps_list})
+  target_link_libraries(${internal_target_name} ${full_deps_list})
 
-    add_library(
-      ${fq_target_name}
-      # We want an object library as the objects will eventually get packaged into
-      # an archive (like libc.a).
-      EXCLUDE_FROM_ALL
-      OBJECT
-      ${ADD_ENTRYPOINT_OBJ_SRCS}
-      ${ADD_ENTRYPOINT_OBJ_HDRS}
-    )
-    target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING)
-    target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
-    target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
-    add_dependencies(${fq_target_name} ${full_deps_list})
-    target_link_libraries(${fq_target_name} ${full_deps_list})
+  # The NVPTX target cannot use LTO for the internal targets used for testing.
+  if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    target_compile_options(${internal_target_name} PRIVATE 
+                           -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
   endif()
 
+  add_library(
+    ${fq_target_name}
+    # We want an object library as the objects will eventually get packaged into
+    # an archive (like libc.a).
+    EXCLUDE_FROM_ALL
+    OBJECT
+    ${ADD_ENTRYPOINT_OBJ_SRCS}
+    ${ADD_ENTRYPOINT_OBJ_HDRS}
+  )
+  target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING)
+  target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
+  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  add_dependencies(${fq_target_name} ${full_deps_list})
+  target_link_libraries(${fq_target_name} ${full_deps_list})
+
   set_target_properties(
     ${fq_target_name}
     PROPERTIES
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 6ca9516ff7a0..373cbd685385 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -449,7 +449,7 @@ function(add_integration_test test_name)
     ${fq_build_target_name}
     EXCLUDE_FROM_ALL
     # The NVIDIA 'nvlink' linker does not currently support static libraries.
-    $<$<BOOL:${LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
+    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
     ${INTEGRATION_TEST_SRCS}
     ${INTEGRATION_TEST_HDRS}
   )
@@ -461,8 +461,17 @@ function(add_integration_test test_name)
   _get_hermetic_test_compile_options(compile_options "${INTEGRATION_TEST_COMPILE_OPTIONS}")
   target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
 
-  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
-    target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static)
+  if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+    target_link_options(${fq_build_target_name} PRIVATE 
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
+      "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
+      "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
+  elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    # We need to use the internal object versions for NVPTX.
+    set(internal_suffix ".__internal__")
+    target_link_options(${fq_build_target_name} PRIVATE 
+      -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
+      "--cuda-path=${LIBC_CUDA_ROOT}")
   elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
     target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
   else()
@@ -474,9 +483,10 @@ function(add_integration_test test_name)
   target_link_libraries(
     ${fq_build_target_name}
     # The NVIDIA 'nvlink' linker does not currently support static libraries.
-    $<$<NOT:$<BOOL:${LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>
-    libc.startup.${LIBC_TARGET_OS}.crt1
-    libc.test.IntegrationTest.test)
+    $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>
+    libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
+    libc.test.IntegrationTest.test${internal_suffix}
+  )
   add_dependencies(${fq_build_target_name}
                    libc.test.IntegrationTest.test
                    ${INTEGRATION_TEST_DEPENDS})
@@ -495,7 +505,7 @@ function(add_integration_test test_name)
   # makes `add_custom_target` construct the correct command and execute it.
   set(test_cmd
       ${INTEGRATION_TEST_ENV}
-      $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_GPU}>:${gpu_loader_exe}>
+      $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
       ${CMAKE_CROSSCOMPILING_EMULATOR}
       ${INTEGRATION_TEST_LOADER_ARGS}
       $<TARGET_FILE:${fq_build_target_name}> ${INTEGRATION_TEST_ARGS})
@@ -606,7 +616,7 @@ function(add_libc_hermetic_test test_name)
     ${fq_build_target_name}
     EXCLUDE_FROM_ALL
     # The NVIDIA 'nvlink' linker does not currently support static libraries.
-    $<$<BOOL:${LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
+    $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:${link_object_files}>
     ${HERMETIC_TEST_SRCS}
     ${HERMETIC_TEST_HDRS}
   )
@@ -615,6 +625,8 @@ function(add_libc_hermetic_test test_name)
       RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
       #OUTPUT_NAME ${fq_target_name}
   )
+
+  _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
   target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
   _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
@@ -629,8 +641,17 @@ function(add_libc_hermetic_test test_name)
     endif()
   endforeach()
 
-  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
-    target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static)
+  if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+    target_link_options(${fq_build_target_name} PRIVATE 
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
+      "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
+      "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
+  elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+    # We need to use the internal object versions for NVPTX.
+    set(internal_suffix ".__internal__")
+    target_link_options(${fq_build_target_name} PRIVATE 
+      -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
+      "--cuda-path=${LIBC_CUDA_ROOT}")
   elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
     target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static)
   else()
@@ -642,12 +663,12 @@ function(add_libc_hermetic_test test_name)
   target_link_libraries(
     ${fq_build_target_name}
     PRIVATE
-      libc.startup.${LIBC_TARGET_OS}.crt1
+      libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix}
       ${link_libraries}
       LibcTest.hermetic
       LibcHermeticTestSupport.hermetic
       # The NVIDIA 'nvlink' linker does not currently support static libraries.
-      $<$<NOT:$<BOOL:${LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
+      $<$<NOT:$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>>:${fq_target_name}.__libc__>)
   add_dependencies(${fq_build_target_name}
                    LibcTest.hermetic
                    libc.test.UnitTest.ErrnoSetterMatcher
@@ -660,7 +681,7 @@ function(add_libc_hermetic_test test_name)
   endif()
 
   set(test_cmd ${HERMETIC_TEST_ENV}
-      $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
+      $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
       $<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
   add_custom_target(
     ${fq_target_name}
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index 2086175bae6c..75beef86760c 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -1,23 +1,8 @@
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   message(FATAL_ERROR
           "libc build: Invalid attempt to set up GPU architectures.")
 endif()
 
-# Set up the target architectures to build the GPU libc for.
-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
-                             "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942"
-                             "gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034"
-                             "gfx1035;gfx1036"
-                             "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151")
-set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
-                            "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90")
-set(all_gpu_architectures
-    "${all_amdgpu_architectures};${all_nvptx_architectures}")
-set(LIBC_GPU_ARCHITECTURES "all" CACHE STRING
-    "List of GPU architectures to build the libc for.")
-set(AMDGPU_TARGET_TRIPLE "amdgcn-amd-amdhsa")
-set(NVPTX_TARGET_TRIPLE "nvptx64-nvidia-cuda")
-
 # Ensure the compiler is a valid clang when building the GPU target.
 set(req_ver "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}")
 if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND
@@ -31,40 +16,6 @@ if(NOT LLVM_LIBC_FULL_BUILD)
                       "GPU.")
 endif()
 
-# Identify any locally installed AMD GPUs on the system using 'amdgpu-arch'.
-find_program(LIBC_AMDGPU_ARCH
-             NAMES amdgpu-arch NO_DEFAULT_PATH
-             PATHS ${LLVM_BINARY_DIR}/bin /opt/rocm/llvm/bin/)
-
-# Identify any locally installed NVIDIA GPUs on the system using 'nvptx-arch'.
-find_program(LIBC_NVPTX_ARCH
-             NAMES nvptx-arch NO_DEFAULT_PATH
-             PATHS ${LLVM_BINARY_DIR}/bin)
-
-# Get the list of all natively supported GPU architectures.
-set(detected_gpu_architectures "")
-foreach(arch_tool ${LIBC_NVPTX_ARCH} ${LIBC_AMDGPU_ARCH})
-  if(arch_tool)
-    execute_process(COMMAND ${arch_tool}
-                    OUTPUT_VARIABLE arch_tool_output
-                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-    string(REPLACE "\n" ";" arch_list "${arch_tool_output}")
-    list(APPEND detected_gpu_architectures "${arch_list}")
-  endif()
-endforeach()
-list(REMOVE_DUPLICATES detected_gpu_architectures)
-
-if(LIBC_GPU_ARCHITECTURES STREQUAL "all")
-  set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures})
-elseif(LIBC_GPU_ARCHITECTURES STREQUAL "native")
-  if(NOT detected_gpu_architectures)
-    message(FATAL_ERROR "No GPUs found on the system when using 'native'")
-  endif()
-  set(LIBC_GPU_ARCHITECTURES ${detected_gpu_architectures})
-endif()
-message(STATUS "Building libc for the following GPU architecture(s): "
-               "${LIBC_GPU_ARCHITECTURES}")
-
 # Identify the program used to package multiple images into a single binary.
 find_program(LIBC_CLANG_OFFLOAD_PACKAGER
              NAMES clang-offload-packager NO_DEFAULT_PATH
@@ -87,49 +38,54 @@ else()
 endif()
 
 set(LIBC_GPU_TEST_ARCHITECTURE "" CACHE STRING "Architecture for the GPU tests")
-
-set(gpu_test_architecture "")
-if(LIBC_GPU_TEST_ARCHITECTURE)
-  set(gpu_test_architecture ${LIBC_GPU_TEST_ARCHITECTURE})
-  message(STATUS "Using user-specified GPU architecture for testing: "
-                 "'${gpu_test_architecture}'")
-elseif(detected_gpu_architectures)
-  list(GET detected_gpu_architectures 0 gpu_test_architecture)
-  message(STATUS "Using GPU architecture detected on the system for testing: "
-                 "'${gpu_test_architecture}'")
-else()
-  list(LENGTH LIBC_GPU_ARCHITECTURES n_gpu_archs)
-  if (${n_gpu_archs} EQUAL 1)
-    set(gpu_test_architecture ${LIBC_GPU_ARCHITECTURES})
-    message(STATUS "Using user-specified GPU architecture for testing: "
-                  "'${gpu_test_architecture}'")
-  else()
-    message(STATUS "No GPU architecture set for testing. GPU tests will not be "
-                  "availibe. Set 'LIBC_GPU_TEST_ARCHITECTURE' to override.")
-    return()
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+  check_cxx_compiler_flag("-nogpulib -mcpu=native" PLATFORM_HAS_GPU)
+elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+  # Identify any locally installed NVIDIA GPUs on the system using 'nvptx-arch'.
+  # Using 'check_cxx_compiler_flag' does not work currently due to the link job.
+  find_program(LIBC_NVPTX_ARCH
+               NAMES nvptx-arch NO_DEFAULT_PATH
+               PATHS ${LLVM_BINARY_DIR}/bin)
+  if(LIBC_NVPTX_ARCH)
+    execute_process(COMMAND ${LIBC_NVPTX_ARCH}
+                    OUTPUT_VARIABLE arch_tool_output
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(arch_tool_output MATCHES "^sm_[0-9]+")
+      set(PLATFORM_HAS_GPU TRUE)
+    endif()
   endif()
 endif()
 
-if("${gpu_test_architecture}" IN_LIST all_amdgpu_architectures)
-  set(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU TRUE)
-  set(LIBC_GPU_TARGET_TRIPLE ${AMDGPU_TARGET_TRIPLE})
-  set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}")
-elseif("${gpu_test_architecture}" IN_LIST all_nvptx_architectures)
-  set(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX TRUE)
-  set(LIBC_GPU_TARGET_TRIPLE ${NVPTX_TARGET_TRIPLE})
-  set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}")
+set(gpu_test_architecture "")
+if(LIBC_GPU_TEST_ARCHITECTURE)
+  set(LIBC_GPU_TESTS_DISABLED FALSE)
+  set(gpu_test_architecture ${LIBC_GPU_TEST_ARCHITECTURE})
+  message(STATUS "Using user-specified GPU architecture for testing: "
+                 "'${gpu_test_architecture}'")
+elseif(PLATFORM_HAS_GPU)
+  set(LIBC_GPU_TESTS_DISABLED FALSE)
+  set(gpu_test_architecture "native")
+  message(STATUS "Using GPU architecture detected on the system for testing: "
+                 "'native'")
 else()
-  message(FATAL_ERROR "Unknown GPU architecture '${gpu_test_architecture}'")
+  set(LIBC_GPU_TESTS_DISABLED TRUE)
+  message(STATUS "No GPU architecture detected or provided, tests will not be "
+                 "built")
 endif()
+set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}")
+
+if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+  # FIXME: This is a hack required to keep the CUDA package from trying to find
+  #        pthreads. We only link the CUDA driver, so this is unneeded.
+  add_library(CUDA::cudart_static_deps IMPORTED INTERFACE)
 
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
   find_package(CUDAToolkit QUIET)
   if(CUDAToolkit_FOUND)
     get_filename_component(LIBC_CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
   endif()
 endif()
 
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
   # The AMDGPU environment uses different code objects to encode the ABI for
   # kernel calls and intrinsic functions. We want to specify this manually to
   # conform to whatever the test suite was built to handle.
diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst
index 71f5e7ba2039..79b9116c38ed 100644
--- a/libc/docs/gpu/using.rst
+++ b/libc/docs/gpu/using.rst
@@ -14,25 +14,25 @@ Building the GPU library
 
 LLVM's libc GPU support *must* be built with an up-to-date ``clang`` compiler
 due to heavy reliance on ``clang``'s GPU support. This can be done automatically
-using the ``LLVM_ENABLE_RUNTIMES=libc`` option. To enable libc for the GPU,
-enable the ``LIBC_GPU_BUILD`` option. By default, ``libcgpu.a`` will be built
-using every supported GPU architecture. To restrict the number of architectures
-build, either set ``LIBC_GPU_ARCHITECTURES`` to the list of desired
-architectures manually or use ``native`` to detect the GPUs on your system. A
-typical ``cmake`` configuration will look like this:
+using the LLVM runtimes support. The GPU build is done using cross-compilation 
+to the GPU architecture. This project currently supports AMD and NVIDIA GPUs 
+which can be targeted using the appropriate target name. The following 
+invocation will enable a cross-compiling build for the GPU architecture and 
+enable the ``libc`` project only for them. 
 
 .. code-block:: sh
 
   $> cd llvm-project  # The llvm-project checkout
   $> mkdir build
   $> cd build
-  $> cmake ../llvm -G Ninja                                \
-     -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt"        \
-     -DLLVM_ENABLE_RUNTIMES="libc;openmp"                  \
+  $> cmake ../llvm -G Ninja                                               \
+     -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt"                       \
+     -DLLVM_ENABLE_RUNTIMES="openmp"                                      \
      -DCMAKE_BUILD_TYPE=<Debug|Release>   \ # Select build type
-     -DLIBC_GPU_BUILD=ON                  \ # Build in GPU mode
-     -DLIBC_GPU_ARCHITECTURES=all         \ # Build all supported architectures
-     -DCMAKE_INSTALL_PREFIX=<PATH>        \ # Where 'libcgpu.a' will live
+     -DCMAKE_INSTALL_PREFIX=<PATH>        \ # Where 'libcgpu.a' will live 
+     -DRUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES=libc             \      
+     -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=libc               \
+     -DLLVM_RUNTIME_TARGETS=default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda
   $> ninja install
 
 Since we want to include ``clang``, ``lld`` and ``compiler-rt`` in our
@@ -40,13 +40,14 @@ toolchain, we list them in ``LLVM_ENABLE_PROJECTS``. To ensure ``libc`` is built
 using a compatible compiler and to support ``openmp`` offloading, we list them
 in ``LLVM_ENABLE_RUNTIMES`` to build them after the enabled projects using the
 newly built compiler. ``CMAKE_INSTALL_PREFIX`` specifies the installation
-directory in which to install the ``libcgpu.a`` library and headers along with
-LLVM. The generated headers will be placed in ``include/gpu-none-llvm``.
+directory in which to install the ``libcgpu-nvptx.a`` and ``libcgpu-amdgpu.a`` 
+libraries and headers along with LLVM. The generated headers will be placed in 
+``include/<gpu-triple>``.
 
 Usage
 =====
 
-Once the ``libcgpu.a`` static archive has been built it can be linked directly
+Once the static archive has been built it can be linked directly
 with offloading applications as a standard library. This process is described in
 the `clang documentation <https://clang.llvm.org/docs/OffloadingDesign.html>`_.
 This linking mode is used by the OpenMP toolchain, but is currently opt-in for
@@ -68,7 +69,7 @@ supported target device. The supported architectures can be seen using LLVM's
 
   OFFLOADING IMAGE [0]:
   kind            llvm ir
-  arch            gfx90a
+  arch            generic
   triple          amdgcn-amd-amdhsa
   producer        none
 
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index dc3c9b8e6328..9090b3bca01e 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -4,7 +4,7 @@ set(LIBC_INCLUDE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 include(LLVMLibCHeaderRules)
 
 # The GPU build wants to install files in the compiler's resource directory.
-if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(LIBC_TARGET_OS_IS_GPU)
   include(GetClangResourceDir)
 endif()
 
@@ -586,7 +586,7 @@ add_gen_header(
     .llvm-libc-types.wchar_t
 )
 
-if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(LIBC_TARGET_OS_IS_GPU)
   file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu)
 
   add_gen_header(
@@ -638,7 +638,7 @@ foreach(target IN LISTS all_install_header_targets)
   # The GPU optionally provides the supported declarations externally so
   # offloading languages like CUDA and OpenMP know what is supported by libc. We
   # install these in the compiler's resource directory at a preset location.
-  if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND PACKAGE_VERSION)
+  if(LIBC_TARGET_OS_IS_GPU AND PACKAGE_VERSION)
     get_target_property(decls_file ${target} DECLS_FILE_PATH)
     if(NOT decls_file)
       continue()
diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index c1a804232c1f..615f4270646f 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -2,11 +2,7 @@ set(libc_archive_targets "")
 set(libc_archive_names "")
 set(libc_archive_entrypoint_lists "")
 if(LLVM_LIBC_FULL_BUILD)
-  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
-    list(APPEND libc_archive_names cgpu mgpu)
-  else()
-    list(APPEND libc_archive_names c m)
-  endif()
+  list(APPEND libc_archive_names c m)
   list(APPEND libc_archive_targets libc libm)
   list(APPEND libc_archive_entrypoint_lists
        TARGET_LIBC_ENTRYPOINTS TARGET_LIBM_ENTRYPOINTS)
@@ -40,6 +36,27 @@ foreach(archive IN ZIP_LISTS
     endif()
   endif()
   list(APPEND added_archive_targets ${archive_1})
+
+  # Add the offloading version of the library for offloading languages. These
+  # are installed in the standard search path separate from the other libraries.
+  if(LIBC_TARGET_OS_IS_GPU)
+    set(libc_gpu_archive_target ${archive_1}gpu)
+    set(libc_gpu_archive_name ${archive_0}gpu-${LIBC_TARGET_ARCHITECTURE})
+
+    add_gpu_entrypoint_library(
+      ${libc_gpu_archive_target}
+      DEPENDS
+        ${${archive_2}}
+    )
+    set_target_properties(
+      ${libc_gpu_archive_target}
+      PROPERTIES
+        ARCHIVE_OUTPUT_NAME ${libc_gpu_archive_name}
+    )
+    set_target_properties(${libc_gpu_archive_target} PROPERTIES 
+                          ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR})
+    list(APPEND added_gpu_archive_targets ${libc_gpu_archive_target})
+  endif()
 endforeach()
 
 install(
@@ -48,6 +65,14 @@ install(
   COMPONENT libc
 )
 
+if(LIBC_TARGET_OS_IS_GPU)
+  install(
+    TARGETS ${added_gpu_archive_targets}
+    ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+    COMPONENT libc
+  )
+endif()
+
 if(NOT LIBC_TARGET_OS_IS_BAREMETAL)
   # For now we will disable libc-startup installation for baremetal. The
   # correct way to do it would be to make a hookable startup for baremetal
diff --git a/libc/src/__support/File/CMakeLists.txt b/libc/src/__support/File/CMakeLists.txt
index b3e4cc4b0277..b7c0612096aa 100644
--- a/libc/src/__support/File/CMakeLists.txt
+++ b/libc/src/__support/File/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(NOT (TARGET libc.src.__support.threads.mutex)
-   OR LIBC_TARGET_ARCHITECTURE_IS_GPU)
+   OR LIBC_TARGET_OS_IS_GPU)
   # Not all platforms have a mutex implementation. If mutex is unvailable,
   # we just skip everything about files.
   return()
diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index 5a899215f4b6..d7ebd3cab7ab 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
diff --git a/libc/src/__support/OSUtil/CMakeLists.txt b/libc/src/__support/OSUtil/CMakeLists.txt
index c19677582643..ca3b3bf1263e 100644
--- a/libc/src/__support/OSUtil/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/CMakeLists.txt
@@ -9,7 +9,7 @@ if(NOT TARGET ${target_os_util})
 endif()
 
 # The OSUtil is an object library in GPU mode.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   add_header_library(
     osutil
     HDRS
diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
index b44a65b3732e..183fc6f8683e 100644
--- a/libc/src/__support/RPC/CMakeLists.txt
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 05ce51e8fc65..33dc1fc97c56 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -1,6 +1,9 @@
 add_subdirectory(generic)
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
   add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
+elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  # TODO: We should split this into 'nvptx' and 'amdgpu' for the GPU build.
+  add_subdirectory(${LIBC_TARGET_OS})
 endif()
 
 function(add_math_entrypoint_object name)
@@ -8,6 +11,7 @@ function(add_math_entrypoint_object name)
   # that first and return early if we are able to add an alias target for the
   # machine specific implementation.
   get_fq_target_name("${LIBC_TARGET_ARCHITECTURE}.${name}" fq_machine_specific_target_name)
+  get_fq_target_name("${LIBC_TARGET_OS}.${name}" fq_os_specific_target_name)
   if(TARGET ${fq_machine_specific_target_name})
     add_entrypoint_object(
       ${name}
@@ -16,17 +20,25 @@ function(add_math_entrypoint_object name)
         .${LIBC_TARGET_ARCHITECTURE}.${name}
     )
     return()
+  elseif(TARGET ${fq_os_specific_target_name})
+    add_entrypoint_object(
+      ${name}
+      ALIAS
+      DEPENDS
+        .${LIBC_TARGET_OS}.${name}
+    )
+    return()
   endif()
 
   # The GPU optionally depends on vendor libraries. If we emitted one of these
   # entrypoints it means the user requested it and we should use it instead.
-  get_fq_target_name("${LIBC_TARGET_ARCHITECTURE}.vendor.${name}" fq_vendor_specific_target_name)
+  get_fq_target_name("${LIBC_TARGET_OS}.vendor.${name}" fq_vendor_specific_target_name)
   if(TARGET ${fq_vendor_specific_target_name})
     add_entrypoint_object(
       ${name}
       ALIAS
       DEPENDS
-        .${LIBC_TARGET_ARCHITECTURE}.vendor.${name}
+        .${LIBC_TARGET_OS}.vendor.${name}
       VENDOR
     )
     return()
diff --git a/libc/src/math/gpu/vendor/CMakeLists.txt b/libc/src/math/gpu/vendor/CMakeLists.txt
index f699ca103b5f..36087ade63bf 100644
--- a/libc/src/math/gpu/vendor/CMakeLists.txt
+++ b/libc/src/math/gpu/vendor/CMakeLists.txt
@@ -10,7 +10,6 @@ else()
                  "functions will be an external reference to the vendor libraries.")
 endif()
 
-find_package(CUDAToolkit QUIET)
 if(CUDAToolkit_FOUND)
   set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
   if (EXISTS ${libdevice_path})
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index 380474ce2711..bb8e41606c5d 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -22,7 +22,7 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
 endif()
 
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/generic)
 endif()
 
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index a4d51fb9a11e..ce08635df314 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -316,7 +316,7 @@ if(LLVM_LIBC_INCLUDE_SCUDO)
     DEPENDS
       ${SCUDO_DEPS}
   )
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(LIBC_TARGET_OS_IS_GPU)
   add_entrypoint_external(
     calloc
   )
@@ -397,7 +397,7 @@ add_entrypoint_object(
     .${LIBC_TARGET_OS}.abort
 )
 
-if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(LIBC_TARGET_OS_IS_GPU)
   add_entrypoint_object(
     malloc
     ALIAS
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 6daaf1998ea7..1c893280e8a3 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -501,7 +501,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_bcmp(bcmp_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512BW)
   add_bcmp(bcmp_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_bcmp(bcmp)
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(LIBC_TARGET_OS_IS_GPU)
   add_bcmp(bcmp)
 else()
   add_bcmp(bcmp_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
@@ -530,7 +530,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_bzero(bzero_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
   add_bzero(bzero_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_bzero(bzero)
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(LIBC_TARGET_OS_IS_GPU)
   add_bzero(bzero)
 else()
   add_bzero(bzero_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
@@ -562,7 +562,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
   add_memcmp(memcmp_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcmp(memcmp)
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(LIBC_TARGET_OS_IS_GPU)
   add_memcmp(memcmp)
 else()
   add_memcmp(memcmp_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
@@ -598,7 +598,7 @@ elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
   add_memcpy(memcpy_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}
                                       MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0")
   add_memcpy(memcpy                   MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0")
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(LIBC_TARGET_OS_IS_GPU)
   add_memcpy(memcpy)
 else()
   add_memcpy(memcpy_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
@@ -632,7 +632,7 @@ elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
   add_memmove(memmove_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}
                                         MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0")
   add_memmove(memmove                   MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0")
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(LIBC_TARGET_OS_IS_GPU)
   add_memmove(memmove)
 else()
   add_memmove(memmove_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
@@ -667,7 +667,7 @@ elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
   add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}
                                       MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0")
   add_memset(memset                   MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0")
-elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(LIBC_TARGET_OS_IS_GPU)
   add_memset(memset)
 else()
   add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt
index fa7f69f19520..6f67fa9ff44f 100644
--- a/libc/startup/gpu/CMakeLists.txt
+++ b/libc/startup/gpu/CMakeLists.txt
@@ -28,33 +28,24 @@ function(add_startup_object name)
   )
 endfunction()
 
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
-  add_subdirectory(amdgpu)
-
-  add_startup_object(
-    crt1
-    ALIAS
-    DEPENDS
-      .amdgpu.crt1
-  )
-elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
-  add_subdirectory(nvptx)
-
-  add_startup_object(
-    crt1
-    ALIAS
-    DEPENDS
-      .nvptx.crt1
-  )
-else()
-  # Skip building the startup code if there are no supported GPUs.
-  message(STATUS "Skipping startup for gpu target, no GPUs were detected")
-  return()
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
+  add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
 endif()
 
+add_startup_object(
+  crt1
+  ALIAS
+  DEPENDS
+  .${LIBC_TARGET_ARCHITECTURE}.crt1
+)
+
 add_custom_target(libc-startup)
 set(startup_components crt1)
 foreach(target IN LISTS startup_components)
   set(fq_target_name libc.startup.gpu.${target})
   add_dependencies(libc-startup ${fq_target_name})
+  install(FILES $<TARGET_OBJECTS:${fq_target_name}>
+          DESTINATION ${LIBC_INSTALL_LIBRARY_DIR}
+          RENAME $<TARGET_PROPERTY:${fq_target_name},OUTPUT_NAME>
+          COMPONENT libc)
 endforeach()
diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
index c9d0ee2fd0e9..3ac104ee8ba9 100644
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -1,6 +1,5 @@
 add_startup_object(
   crt1
-  NO_GPU_BUNDLE # Compile this file directly without special GPU handling.
   SRC
     start.cpp
   DEPENDS
@@ -11,17 +10,5 @@ add_startup_object(
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
-    -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION} # Manually set the ABI.
 )
 get_fq_target_name(crt1 fq_name)
-
-# Ensure that clang uses the correct linker for this object type.
-target_link_libraries(
-  ${fq_name}
-  PUBLIC
-  "-mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}"
-  "--target=${LIBC_GPU_TARGET_TRIPLE}"
-  "-flto"
-  "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0"
-  "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}"
-)
diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
index 23a54516cc98..3ac104ee8ba9 100644
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -1,6 +1,5 @@
 add_startup_object(
   crt1
-  NO_GPU_BUNDLE # Compile this file directly without special GPU handling.
   SRC
     start.cpp
   DEPENDS
@@ -13,11 +12,3 @@ add_startup_object(
     -fno-builtin
 )
 get_fq_target_name(crt1 fq_name)
-
-# Ensure that clang uses the correct linker for this object type.
-target_link_libraries(${fq_name}
-  PUBLIC
-  "-march=${LIBC_GPU_TARGET_ARCHITECTURE}"
-  "--target=${LIBC_GPU_TARGET_TRIPLE}"
-  "--cuda-path=${LIBC_CUDA_ROOT}"
-)
diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt
index f22f2b183aca..745a9a04b4af 100644
--- a/libc/test/CMakeLists.txt
+++ b/libc/test/CMakeLists.txt
@@ -8,9 +8,9 @@ add_custom_target(libc-long-running-tests)
 
 add_subdirectory(UnitTest)
 
-if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND
-   (NOT TARGET libc.utils.gpu.loader OR NOT TARGET libc.startup.gpu.crt1))
-  message(WARNING "Cannot build libc GPU tests, missing loader implementation")
+if(LIBC_TARGET_OS_IS_GPU AND
+   (NOT TARGET libc.utils.gpu.loader OR LIBC_GPU_TESTS_DISABLED))
+  message(WARNING "Cannot build libc GPU tests, missing loader or architecture")
   return()
 endif()
 
diff --git a/libc/test/IntegrationTest/CMakeLists.txt b/libc/test/IntegrationTest/CMakeLists.txt
index dca4c5a6f1b1..4f31f10b29f0 100644
--- a/libc/test/IntegrationTest/CMakeLists.txt
+++ b/libc/test/IntegrationTest/CMakeLists.txt
@@ -1,21 +1,5 @@
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
-  set(TEST_COMPILE_FLAGS
-    -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}
-    -emit-llvm # AMDGPU's intermediate object file format is bitcode.
-    --target=${LIBC_GPU_TARGET_TRIPLE}
-    -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION} # Manually set the ABI.
-  )
-elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
-  set(TEST_COMPILE_FLAGS
-    -march=${LIBC_GPU_TARGET_ARCHITECTURE}
-    --target=${LIBC_GPU_TARGET_TRIPLE}
-    --cuda-path=${LIBC_CUDA_ROOT}
-  )
-endif()
-
 add_object_library(
   test
-  NO_GPU_BUNDLE # Compile this file directly without special GPU handling.
   SRCS
     test.cpp
   COMPILE_OPTIONS
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index 4a615d4bd5e1..4668f0061975 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -12,7 +12,7 @@ function(add_unittest_framework_library name)
   endif()
 
   # The Nvidia 'nvlink' linker does not support static libraries.
-  if(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+  if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
     set(library_type OBJECT)
   else()
     set(library_type STATIC)
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 9801621e6b39..53fa1323d18b 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_custom_target(libc-support-tests)
 
 # FIXME: These tests are currently broken on the GPU.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   add_libc_test(
     blockstore_test
     SUITE
@@ -76,7 +76,7 @@ add_libc_test(
 )
 
 # The GPU does not support varargs currently.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   add_libc_test(
     arg_list_test
     SUITE
@@ -88,8 +88,7 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
   )
 endif()
 
-# FIXME: Crash in NVPTX target lowering for calls
-if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_libc_test(
     uint_test
     SUITE
@@ -159,29 +158,33 @@ add_libc_test(
     libc.src.__support.memory_size
 )
 
-add_executable(
-  libc_str_to_float_comparison_test
-  str_to_float_comparison_test.cpp
-)
+# FIXME: We shouldn't have regular executables created because we could be
+#        cross-compiling the tests and running through an emulator.
+if(NOT LIBC_TARGET_OS_IS_GPU)
+  add_executable(
+    libc_str_to_float_comparison_test
+    str_to_float_comparison_test.cpp
+  )
 
-target_link_libraries(libc_str_to_float_comparison_test
-  PRIVATE
-    "${LIBC_TARGET}"
-)
+  target_link_libraries(libc_str_to_float_comparison_test
+    PRIVATE
+      "${LIBC_TARGET}"
+  )
 
-add_executable(
-  libc_system_str_to_float_comparison_test
-  str_to_float_comparison_test.cpp
-)
+  add_executable(
+    libc_system_str_to_float_comparison_test
+    str_to_float_comparison_test.cpp
+  )
 
-set(float_test_file ${CMAKE_CURRENT_SOURCE_DIR}/str_to_float_comparison_data.txt)
+  set(float_test_file ${CMAKE_CURRENT_SOURCE_DIR}/str_to_float_comparison_data.txt)
 
-add_custom_command(TARGET libc_str_to_float_comparison_test
-                   POST_BUILD
-                   COMMAND $<TARGET_FILE:libc_str_to_float_comparison_test> ${float_test_file}
-                   DEPENDS ${float_test_file}
-                   COMMENT "Test the strtof and strtod implementations against precomputed results." 
-                   VERBATIM)
+  add_custom_command(TARGET libc_str_to_float_comparison_test
+                     POST_BUILD
+                     COMMAND $<TARGET_FILE:libc_str_to_float_comparison_test> ${float_test_file}
+                     DEPENDS ${float_test_file}
+                     COMMENT "Test the strtof and strtod implementations against precomputed results." 
+                     VERBATIM)
+endif()
 
 add_subdirectory(CPP)
 add_subdirectory(File)
diff --git a/libc/test/src/__support/CPP/CMakeLists.txt b/libc/test/src/__support/CPP/CMakeLists.txt
index 6927579289bc..d7f332f5b0fb 100644
--- a/libc/test/src/__support/CPP/CMakeLists.txt
+++ b/libc/test/src/__support/CPP/CMakeLists.txt
@@ -64,7 +64,7 @@ add_libc_test(
 
 
 # This test fails with invalid address space operations on sm_60
-if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_libc_test(
     atomic_test
     SUITE
diff --git a/libc/test/src/__support/File/CMakeLists.txt b/libc/test/src/__support/File/CMakeLists.txt
index f193480c60c2..9191469b4927 100644
--- a/libc/test/src/__support/File/CMakeLists.txt
+++ b/libc/test/src/__support/File/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT (TARGET libc.src.__support.threads.mutex) OR LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT (TARGET libc.src.__support.threads.mutex) OR LIBC_TARGET_OS_IS_GPU)
   # Not all platforms have a mutex implementation. If mutex is unvailable,
   # we just skip everything about files. The GPU does not currently support
   # files as well.
diff --git a/libc/test/src/errno/CMakeLists.txt b/libc/test/src/errno/CMakeLists.txt
index 633d46a1f5f8..b73962fb4de4 100644
--- a/libc/test/src/errno/CMakeLists.txt
+++ b/libc/test/src/errno/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT LLVM_LIBC_FULL_BUILD OR LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LLVM_LIBC_FULL_BUILD OR LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 8c105515e352..81d2e1e55b55 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1,10 +1,14 @@
 add_custom_target(libc-math-unittests)
 
-add_library(
-  libc_math_test_utils
-  RandUtils.cpp
-  RandUtils.h
-)
+# FIXME: We shouldn't have regular libraries created because we could be
+#        cross-compiling the tests and running through an emulator.
+if(NOT LIBC_TARGET_OS_IS_GPU)
+  add_library(
+    libc_math_test_utils
+    RandUtils.cpp
+    RandUtils.h
+  )
+endif()
 
 add_fp_unittest(
   cosf_test
@@ -755,7 +759,7 @@ add_fp_unittest(
 )
 
 # FIXME: These tests are currently broken for NVPTX.
-if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_fp_unittest(
     ilogb_test
     SUITE
@@ -986,7 +990,7 @@ add_fp_unittest(
 )
 
 # FIXME: These tests are currently broken on the GPU.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   add_fp_unittest(
     fminf_test
     SUITE
@@ -1231,7 +1235,7 @@ add_fp_unittest(
 )
 
 # FIXME: These tests are currently spurious for NVPTX.
-if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_fp_unittest(
     nextafter_test
     SUITE
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 1824c672cb97..2d24b5a76b01 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -819,7 +819,7 @@ add_fp_unittest(
 )
 
 # FIXME: These tests are currently broken for NVPTX.
-if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_fp_unittest(
     ilogb_test
     SUITE
@@ -1073,7 +1073,7 @@ add_fp_unittest(
 )
 
 # FIXME: These tests are currently broken on the GPU.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   add_fp_unittest(
     fminf_test
     SUITE
@@ -1417,7 +1417,7 @@ add_fp_unittest(
 )
 
 # FIXME: These tests are currently spurious for NVPTX.
-if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_fp_unittest(
     nextafter_test
     SUITE
@@ -1465,7 +1465,7 @@ add_fp_unittest(
 )
 
 # FIXME: These tests are currently spurious for the GPU.
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(NOT LIBC_TARGET_OS_IS_GPU)
   add_fp_unittest(
     nexttoward_test
     SUITE
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 8db2293ab74a..93c21aa994ef 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -430,7 +430,7 @@ add_libc_test(
 # Create an output directory for any temporary test files.
 file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/testdata)
 
-if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index da07dbbe7977..5826cfe8d4ca 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -55,7 +55,7 @@ add_libc_test(
 )
 
 # This fails on NVPTX where the output value is one-off of the expected value.
-if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_fp_unittest(
     strtod_test
     SUITE
@@ -127,7 +127,7 @@ add_libc_test(
 )
 
 # This fails on NVPTX where the output value is one-off of the expected value.
-if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_libc_test(
     strtold_test
     SUITE
@@ -339,7 +339,7 @@ if(LLVM_LIBC_FULL_BUILD)
   )
 
   # Only the GPU has an in-tree 'malloc' implementation.
-  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  if(LIBC_TARGET_OS_IS_GPU)
     add_libc_test(
       malloc_test
       HERMETIC_TEST_ONLY
diff --git a/libc/test/utils/UnitTest/CMakeLists.txt b/libc/test/utils/UnitTest/CMakeLists.txt
index 6f61e0ffefb0..3b917e06cde2 100644
--- a/libc/test/utils/UnitTest/CMakeLists.txt
+++ b/libc/test/utils/UnitTest/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
diff --git a/libc/utils/CMakeLists.txt b/libc/utils/CMakeLists.txt
index 9754dcf3854a..7bf02a4af7de 100644
--- a/libc/utils/CMakeLists.txt
+++ b/libc/utils/CMakeLists.txt
@@ -1,6 +1,6 @@
 if(LLVM_INCLUDE_TESTS)
   add_subdirectory(MPFRWrapper)
 endif()
-if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+if(LIBC_TARGET_OS_IS_GPU)
   add_subdirectory(gpu)
 endif()
diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt
index adc073c9a91f..6f44ca0d786c 100644
--- a/libc/utils/MPFRWrapper/CMakeLists.txt
+++ b/libc/utils/MPFRWrapper/CMakeLists.txt
@@ -24,6 +24,6 @@ if(LIBC_TESTS_CAN_USE_MPFR)
     target_link_directories(libcMPFRWrapper PUBLIC ${LLVM_LIBC_MPFR_INSTALL_PATH}/lib)
   endif()
   target_link_libraries(libcMPFRWrapper PUBLIC LibcFPTestHelpers.unit LibcTest.unit mpfr gmp)
-elseif(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+elseif(NOT LIBC_TARGET_OS_IS_GPU)
   message(WARNING "Math tests using MPFR will be skipped.")
 endif()
diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt
index 7c15f36052cf..4d1ebcfb9f8e 100644
--- a/libc/utils/gpu/CMakeLists.txt
+++ b/libc/utils/gpu/CMakeLists.txt
@@ -1,2 +1,4 @@
 add_subdirectory(server)
-add_subdirectory(loader)
+if(LIBC_TARGET_OS_IS_GPU)
+  add_subdirectory(loader)
+endif()
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
index f195b887c9af..189460bb02e6 100644
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -1,31 +1,30 @@
 add_library(gpu_loader OBJECT Main.cpp)
+
 target_include_directories(gpu_loader PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${LIBC_SOURCE_DIR}/include
   ${LIBC_SOURCE_DIR}
 )
 
+# This utility needs to be compiled for the host system when cross compiling.
+if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE)
+  target_compile_options(gpu_loader PUBLIC --target=${LLVM_HOST_TRIPLE})
+  target_link_libraries(gpu_loader PUBLIC "--target=${LLVM_HOST_TRIPLE}")
+endif()
+
 find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
-if(hsa-runtime64_FOUND)
+if(hsa-runtime64_FOUND AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
   add_subdirectory(amdgpu)
-else()
+elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
   message(STATUS "Skipping HSA loader for gpu target, no HSA was detected")
 endif()
 
-find_package(CUDAToolkit QUIET)
 # The CUDA loader requires LLVM to traverse the ELF image for symbols.
 find_package(LLVM QUIET)
-if(CUDAToolkit_FOUND AND LLVM_FOUND AND
-   "${CUDAToolkit_VERSION}" VERSION_GREATER_EQUAL "11.2")
+if(CUDAToolkit_FOUND AND LLVM_FOUND AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_subdirectory(nvptx)
-else()
-  if("${CUDAToolkit_VERSION}" VERSION_LESS "11.2")
-    message(WARNING 
-      "Skipping CUDA loader for gpu target, CUDA must be version 11.2 or later.
-       Found CUDA Version ${CUDAToolkit_VERSION}")
-  else()
-    message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
-  endif()
+elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+  message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
 endif()
 
 # Add a custom target to be used for testing.
@@ -37,20 +36,31 @@ if(LIBC_GPU_LOADER_EXECUTABLE)
     PROPERTIES
       EXECUTABLE "${LIBC_GPU_LOADER_EXECUTABLE}"
   )
-elseif(TARGET amdhsa_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+elseif(TARGET amdhsa-loader AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
   add_custom_target(libc.utils.gpu.loader)
-  add_dependencies(libc.utils.gpu.loader amdhsa_loader)
+  add_dependencies(libc.utils.gpu.loader amdhsa-loader)
   set_target_properties(
     libc.utils.gpu.loader
     PROPERTIES
-      EXECUTABLE "$<TARGET_FILE:amdhsa_loader>"
+      TARGET amdhsa-loader
+      EXECUTABLE "$<TARGET_FILE:amdhsa-loader>"
   )
-elseif(TARGET nvptx_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+elseif(TARGET nvptx-loader AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
   add_custom_target(libc.utils.gpu.loader)
-  add_dependencies(libc.utils.gpu.loader nvptx_loader)
+  add_dependencies(libc.utils.gpu.loader nvptx-loader)
   set_target_properties(
     libc.utils.gpu.loader
     PROPERTIES
-      EXECUTABLE "$<TARGET_FILE:nvptx_loader>"
+      TARGET nvptx-loader
+      EXECUTABLE "$<TARGET_FILE:nvptx-loader>"
   )
 endif()
+
+if(TARGET libc.utils.gpu.loader)
+  get_target_property(gpu_loader_tgt libc.utils.gpu.loader "TARGET")
+  if(gpu_loader_tgt)
+    install(TARGETS ${gpu_loader_tgt}
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+            COMPONENT libc)
+  endif()
+endif()
diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
index 8e9c9a2bdc7d..b99319f50401 100644
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -1,7 +1,7 @@
-add_executable(amdhsa_loader Loader.cpp)
-add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc)
+add_executable(amdhsa-loader Loader.cpp)
+add_dependencies(amdhsa-loader libc.src.__support.RPC.rpc)
 
-target_link_libraries(amdhsa_loader
+target_link_libraries(amdhsa-loader
   PRIVATE
   hsa-runtime64::hsa-runtime64
   gpu_loader
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
index 0c76c49fa309..e76362a1e8cc 100644
--- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -1,11 +1,11 @@
-add_executable(nvptx_loader Loader.cpp)
-add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
+add_executable(nvptx-loader Loader.cpp)
+add_dependencies(nvptx-loader libc.src.__support.RPC.rpc)
 
 if(NOT LLVM_ENABLE_RTTI)
-  target_compile_options(nvptx_loader PRIVATE -fno-rtti)
+  target_compile_options(nvptx-loader PRIVATE -fno-rtti)
 endif()
-target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS})
-target_link_libraries(nvptx_loader
+target_include_directories(nvptx-loader PRIVATE ${LLVM_INCLUDE_DIRS})
+target_link_libraries(nvptx-loader
   PRIVATE
   gpu_loader
   llvmlibc_rpc_server
diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt
index 3d9b2bcab4db..94cdfe5bf652 100644
--- a/libc/utils/gpu/server/CMakeLists.txt
+++ b/libc/utils/gpu/server/CMakeLists.txt
@@ -5,12 +5,21 @@ target_include_directories(llvmlibc_rpc_server PRIVATE ${LIBC_SOURCE_DIR})
 target_include_directories(llvmlibc_rpc_server PUBLIC ${LIBC_SOURCE_DIR}/include)
 target_include_directories(llvmlibc_rpc_server PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
+
 # Ignore unsupported clang attributes if we're using GCC.
 target_compile_options(llvmlibc_rpc_server PUBLIC
                        $<$<CXX_COMPILER_ID:GNU>:-Wno-attributes>)
 target_compile_definitions(llvmlibc_rpc_server PUBLIC
                            LIBC_NAMESPACE=${LIBC_NAMESPACE})
 
+# This utility needs to be compiled for the host system when cross compiling.
+if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE)
+  target_compile_options(llvmlibc_rpc_server PUBLIC 
+                         --target=${LLVM_HOST_TRIPLE})
+  target_link_libraries(llvmlibc_rpc_server PUBLIC
+                        "--target=${LLVM_HOST_TRIPLE}")
+endif()
+
 # Install the server and associated header.
 install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/rpc_server.h
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm/
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index dbd5fbf226bd..f5f7d3f3253f 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -175,7 +175,9 @@ else()
   foreach(_name ${LLVM_RUNTIME_TARGETS})
     if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES)
       set(NEED_LIBC_HDRGEN TRUE)
-      break()
+      if("${_name}" STREQUAL "amdgcn-amd-amdhsa" OR "${_name}" STREQUAL "nvptx64-nvidia-cuda")
+        set(LLVM_LIBC_GPU_BUILD ON)
+      endif()
     endif()
   endforeach()
 endif()
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 486df22c2c1b..4257083e53ad 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -120,6 +120,13 @@ if( LLVM_ENABLE_ASSERTIONS )
   endif()
 endif()
 
+# If we are targeting a GPU architecture we want to ignore all the standard
+# flag handling.
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR
+   "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx64")
+  return()
+endif()
+
 if(LLVM_ENABLE_EXPENSIVE_CHECKS)
   add_compile_definitions(EXPENSIVE_CHECKS)
 
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 8c48d85a4346..9b5e758b6ede 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -199,7 +199,7 @@ foreach(entry ${runtimes})
     list(APPEND prefixes "LLVM_LIBC")
     list(APPEND prefixes "LIBC_")
     # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode.
-    if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+    if(LLVM_LIBC_GPU_BUILD)
       list(APPEND prefixes "CUDA")
     endif()
   endif()
@@ -424,7 +424,7 @@ if(runtimes)
     endforeach()
   endif()
   if("libc" IN_LIST LLVM_ENABLE_PROJECTS AND
-      (LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES))
+      (LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD))
     if(LIBC_HDRGEN_EXE)
       set(hdrgen_exe ${LIBC_HDRGEN_EXE})
     else()
@@ -441,7 +441,12 @@ if(runtimes)
     set(libc_cmake_args "-DLIBC_HDRGEN_EXE=${hdrgen_exe}"
                         "-DLLVM_LIBC_FULL_BUILD=ON")
     list(APPEND extra_deps ${hdrgen_deps})
-    if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+    if(LLVM_LIBC_GPU_BUILD)
+      list(APPEND libc_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON")
+      # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode.
+      if(CUDAToolkit_ROOT)
+        list(APPEND libc_cmake_args "-DCUDAToolkit_ROOT=${CUDAToolkit_ROOT}")
+      endif()
       foreach(dep clang-offload-packager nvptx-arch amdgpu-arch)
         if(TARGET ${dep})
           list(APPEND extra_deps ${dep})
diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index 17e61d0bc47d..a74eff0c0beb 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -119,14 +119,7 @@ endif()
 
 pythonize_bool(LIBOMPTARGET_OMPT_SUPPORT)
 
-# Check if this build supports the GPU libc.
-set(LIBC_GPU_SUPPORT FALSE)
-if("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND (LIBC_GPU_BUILD OR
-                                            LIBC_GPU_ARCHITECTURES))
-  set(LIBC_GPU_SUPPORT TRUE)
-endif()
-
-set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LIBC_GPU_SUPPORT} CACHE BOOL
+set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LLVM_LIBC_GPU_BUILD} CACHE BOOL
     "Libomptarget support for the GPU libc")
 pythonize_bool(LIBOMPTARGET_GPU_LIBC_SUPPORT)
 
diff --git a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
index 8ae3ff2a6d29..085d44307165 100644
--- a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
@@ -73,8 +73,12 @@ elseif(${LIBOMPTARGET_GPU_LIBC_SUPPORT})
   find_library(llvmlibc_rpc_server NAMES llvmlibc_rpc_server
                PATHS ${LIBOMPTARGET_LLVM_LIBRARY_DIR} NO_DEFAULT_PATH)
   if(llvmlibc_rpc_server)
-		target_link_libraries(PluginCommon PRIVATE llvmlibc_rpc_server)
+		target_link_libraries(PluginCommon PRIVATE ${llvmlibc_rpc_server})
 		target_compile_definitions(PluginCommon PRIVATE LIBOMPTARGET_RPC_SUPPORT)
+    # We may need to get the headers directly from the 'libc' source directory.
+    target_include_directories(PluginCommon PRIVATE
+                               ${CMAKE_SOURCE_DIR}/../libc/utils/gpu/server
+                               ${CMAKE_SOURCE_DIR}/../libc/include)
   endif()
 endif()
 
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
index 54aced11b31c..cb6a5086bc4d 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp
@@ -18,7 +18,8 @@
 #if __has_include(<gpu-none-llvm/rpc_server.h>)
 #include <gpu-none-llvm/rpc_server.h>
 #elif defined(LIBOMPTARGET_RPC_SUPPORT)
-#include <rpc_server.h>
+// Just pull this out of the source if available.
+#include "rpc_server.h"
 #endif
 
 using namespace llvm;
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index 565556e64ff2..6c590603079c 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -180,8 +180,12 @@ def remove_suffix_if_present(name):
 
 def add_libraries(source):
     if config.libomptarget_has_libc:
-        return source + " " + config.llvm_library_dir + "/libcgpu.a " + \
-               config.llvm_library_intdir + "/libomptarget.devicertl.a"
+        if config.libomptarget_current_target.startswith('nvptx'):
+            return source + " " + config.llvm_library_dir + "/libcgpu-nvptx.a " + \
+                   config.llvm_library_intdir + "/libomptarget.devicertl.a"
+        elif config.libomptarget_current_target.startswith('amdgcn'):
+            return source + " " + config.llvm_library_dir + "/libcgpu-amdgpu.a " + \
+                   config.llvm_library_intdir + "/libomptarget.devicertl.a"
     return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a"
 
 # substitutions