llvm-project/clang/test/CodeGenCUDA/memcpy-libcall.cu
Alex MacLean 369891b674
[NVPTX] use untyped loads and stores wherever possible (#137698)
In most cases, the type information attached to load and store
instructions is meaningless and inconsistently applied. We can usually
use ".b" loads and avoid the complexity of trying to assign the correct
type. The one exception is the sign-extending load, which will continue to
use ".s" to ensure the sign extension into a larger register is done
correctly.
2025-05-10 08:26:26 -07:00

65 lines
1.9 KiB
Plaintext

// REQUIRES: x86-registered-target
// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \
// RUN: -O3 -S %s -o - | FileCheck -check-prefix=PTX %s
// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \
// RUN: -Os -S %s -o - | FileCheck -check-prefix=PTX %s
#include "Inputs/cuda.h"
// PTX-LABEL: .func _Z12copy_genericPvPKv(
// Device function that copies a fixed 32 bytes between two untyped (void *)
// pointers using __builtin_memcpy. The directives in the body require a
// byte-wide untyped load (ld.b8) followed by a byte-wide untyped store (st.b8),
// neither carrying an address-space qualifier.
// NOTE(review): byte-wide accesses are presumably expected because void *
// carries no alignment information -- confirm against the backend.
void __device__ copy_generic(void *dest, const void *src) {
__builtin_memcpy(dest, src, 32);
// PTX: ld.b8
// PTX: st.b8
}
// PTX-LABEL: .entry _Z11copy_globalPvS_(
// Kernel that copies a fixed 32 bytes between two untyped (void *) kernel
// arguments using __builtin_memcpy. The directives in the body require the
// load and the store to use the .global state space with byte-wide untyped
// accesses (ld.global.b8 / st.global.b8).
// NOTE(review): presumably pointer arguments of a kernel are treated as
// pointing to global memory, which is what allows the .global qualifier --
// confirm against the backend.
void __global__ copy_global(void *dest, void * src) {
__builtin_memcpy(dest, src, 32);
// PTX: ld.global.b8
// PTX: st.global.b8
}
// Aggregate payload used by the struct-copy tests below: eight ints.
// Its name appears in the mangled function names checked by this test, and
// the tests that copy it expect 32-bit accesses (.b32).
struct S {
int data[8];
};
// PTX-LABEL: .entry _Z20copy_param_to_globalP1SS_(
// Kernel that copies its by-value struct argument `param` into the object
// pointed to by `global` using __builtin_memcpy of sizeof(S) bytes. The
// directives in the body require the source to be read with 32-bit untyped
// loads from the .param space and written with 32-bit untyped stores to the
// .global space.
void __global__ copy_param_to_global(S *global, S param) {
__builtin_memcpy(global, &param, sizeof(S));
// PTX: ld.param.b32
// PTX: st.global.b32
}
// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_(
// Kernel that copies its by-value struct argument `param` into the object
// pointed to by `local`, a pointer explicitly qualified with address space 5.
// The directives in the body require 32-bit untyped loads from the .param
// space and 32-bit untyped stores to the .local space, i.e. the test expects
// address space 5 to be emitted as .local.
void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local,
S param) {
__builtin_memcpy(local, &param, sizeof(S));
// PTX: ld.param.b32
// PTX: st.local.b32
}
// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_(
// Device function that copies sizeof(S) bytes from `src`, a pointer explicitly
// qualified with address space 5, to the unqualified pointer `generic`. The
// directives in the body require 32-bit untyped loads from the .local space
// and 32-bit untyped stores with no state-space qualifier (st.b32).
void __device__ copy_local_to_generic(S *generic,
__attribute__((address_space(5))) S *src) {
__builtin_memcpy(generic, src, sizeof(S));
// PTX: ld.local.b32
// PTX: st.b32
}
// File-scope __shared__ instance of S; it is the destination of
// copy_param_to_shared and the source of copy_shared_to_generic below.
__shared__ S shared;
// PTX-LABEL: .entry _Z20copy_param_to_shared1S(
// Kernel that copies its by-value struct argument `param` into the file-scope
// __shared__ object `shared` using __builtin_memcpy of sizeof(S) bytes. The
// directives in the body require 32-bit untyped loads from the .param space
// and 32-bit untyped stores to the .shared space.
void __global__ copy_param_to_shared( S param) {
__builtin_memcpy(&shared, &param, sizeof(S));
// PTX: ld.param.b32
// PTX: st.shared.b32
}
// Device function that copies sizeof(S) bytes from the file-scope __shared__
// object `shared` to the unqualified pointer `generic` using __builtin_memcpy.
// The directives require 32-bit untyped loads from the .shared space and
// 32-bit untyped stores with no state-space qualifier (st.b32).
//
// The label directive below anchors those checks to this function's own
// emitted body, matching every other function in this test; without it they
// would be matched in the region opened by the previous label.
// PTX-LABEL: .func _Z22copy_shared_to_genericP1S(
void __device__ copy_shared_to_generic(S *generic) {
  __builtin_memcpy(generic, &shared, sizeof(S));
// PTX: ld.shared.b32
// PTX: st.b32
}