diff --git a/libclc/clc/include/clc/shared/clc_less_aligned_types.h b/libclc/clc/include/clc/shared/clc_less_aligned_types.h
new file mode 100644
index 000000000000..73e436147ce0
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_less_aligned_types.h
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines types to be used with (CLC) vstore and vload functions. These are
+// vector types whose alignment is that of their respective scalar types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SHARED_CLC_LESS_ALIGNED_TYPES_H__
+#define __CLC_SHARED_CLC_LESS_ALIGNED_TYPES_H__
+
+#define __CLC_BODY <clc/shared/clc_less_aligned_types.inc>
+#include <clc/math/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_less_aligned_types.inc>
+#include <clc/integer/gentype.inc>
+
+#endif // __CLC_SHARED_CLC_LESS_ALIGNED_TYPES_H__
diff --git a/libclc/clc/include/clc/shared/clc_less_aligned_types.inc b/libclc/clc/include/clc/shared/clc_less_aligned_types.inc
new file mode 100644
index 000000000000..45d69ea72fc0
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_less_aligned_types.inc
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines types to be used with (CLC) vstore and vload functions. These are
+// vector types whose alignment is that of their respective scalar types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+typedef __CLC_GENTYPE __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE);
+
+#else
+
+typedef __CLC_GENTYPE __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+    __attribute__((aligned(sizeof(__CLC_SCALAR_GENTYPE))));
+
+#endif
diff --git a/libclc/clc/include/clc/shared/clc_vload.h b/libclc/clc/include/clc/shared/clc_vload.h
new file mode 100644
index 000000000000..c3dbe0696cc1
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vload.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SHARED_CLC_VLOAD_H__
+#define __CLC_SHARED_CLC_VLOAD_H__
+
+#include <clc/shared/clc_less_aligned_types.h>
+
+#define __CLC_BODY <clc/shared/clc_vload.inc>
+#include <clc/math/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_vload.inc>
+#include <clc/integer/gentype.inc>
+
+#endif // __CLC_SHARED_CLC_VLOAD_H__
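
To make the intent of these headers concrete: instantiated at __CLC_GENTYPE == float4, the .inc above yields a float4 typedef that carries only float (4-byte) alignment, which is what lets vloadN/vstoreN touch scalar-aligned memory. A minimal sketch of the expansion, assuming the float4 instantiation (the function name here is hypothetical, not part of the patch):

    /* less_aligned_float4 is float4 with the alignment of float, so a cast
       through it may legally load from any float-aligned address. */
    typedef float4 less_aligned_float4 __attribute__((aligned(sizeof(float))));

    /* Hypothetical helper showing the load pattern the library uses. */
    float4 example_vload4(size_t offset, const __global float *x) {
      /* A plain `__global float4 *` cast would assert 16-byte alignment;
         the less-aligned type keeps this load correct at 4 bytes. */
      return *(const __global less_aligned_float4 *)(&x[4 * offset]);
    }
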
diff --git a/libclc/clc/include/clc/shared/clc_vload.inc b/libclc/clc/include/clc/shared/clc_vload.inc
new file mode 100644
index 000000000000..8f3b00ec0445
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vload.inc
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VLOAD_NAME __CLC_XCONCAT(__clc_vload, __CLC_VECSIZE)
+#define CLC_VLOAD_HALF_NAME __CLC_XCONCAT(__clc_vload_half, __CLC_VECSIZE)
+#define CLC_VLOADA_HALF_NAME __CLC_XCONCAT(__clc_vloada_half, __CLC_VECSIZE)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+
+#define CLC_VLOAD_DECL(ADDRSPACE)                                             \
+  _CLC_OVERLOAD _CLC_DECL CLC_VLOAD_TY CLC_VLOAD_NAME(                        \
+      size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x);
+
+CLC_VLOAD_DECL(__private)
+CLC_VLOAD_DECL(__local)
+CLC_VLOAD_DECL(__constant)
+CLC_VLOAD_DECL(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VLOAD_DECL(__generic)
+#endif
+
+#undef CLC_VLOAD_DECL
+#undef CLC_VLOAD_TY
+
+#endif // __CLC_SCALAR
+
+// vload_half and vloada_half are available even if cl_khr_fp16 is unavailable.
+// Declare these functions when working on float types, which we know are
+// always available.
+#ifdef __CLC_FPSIZE
+#if __CLC_FPSIZE == 32
+
+#define CLC_VLOAD_HALF_DECL(ADDRSPACE)                                        \
+  _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE CLC_VLOAD_HALF_NAME(                  \
+      size_t offset, const ADDRSPACE half *mem);                              \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE CLC_VLOADA_HALF_NAME(                 \
+      size_t offset, const ADDRSPACE half *mem);
+
+CLC_VLOAD_HALF_DECL(__private)
+CLC_VLOAD_HALF_DECL(__local)
+CLC_VLOAD_HALF_DECL(__constant)
+CLC_VLOAD_HALF_DECL(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VLOAD_HALF_DECL(__generic)
+#endif
+
+#undef CLC_VLOAD_HALF_DECL
+
+#endif
+#endif
+
+#undef CLC_VLOAD_NAME
+#undef CLC_VLOAD_HALF_NAME
+#undef CLC_VLOADA_HALF_NAME
diff --git a/libclc/clc/include/clc/shared/clc_vstore.h b/libclc/clc/include/clc/shared/clc_vstore.h
new file mode 100644
index 000000000000..647dc7da1afb
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vstore.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SHARED_CLC_VSTORE_H__
+#define __CLC_SHARED_CLC_VSTORE_H__
+
+#include <clc/shared/clc_less_aligned_types.h>
+
+#define __CLC_BODY <clc/shared/clc_vstore.inc>
+#include <clc/math/gentype.inc>
+
+#define __CLC_BODY <clc/shared/clc_vstore.inc>
+#include <clc/integer/gentype.inc>
+
+#endif // __CLC_SHARED_CLC_VSTORE_H__
diff --git a/libclc/clc/include/clc/shared/clc_vstore.inc b/libclc/clc/include/clc/shared/clc_vstore.inc
new file mode 100644
index 000000000000..38d54b2f1b67
--- /dev/null
+++ b/libclc/clc/include/clc/shared/clc_vstore.inc
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) +#define CLC_VSTORE_NAME __CLC_XCONCAT(__clc_vstore, __CLC_VECSIZE) +#define CLC_VSTORE_HALF_NAME(x) \ + __CLC_XCONCAT(__CLC_XCONCAT(__clc_vstore_half, __CLC_VECSIZE), x) +#define CLC_VSTOREA_HALF_NAME(x) \ + __CLC_XCONCAT(__CLC_XCONCAT(__clc_vstorea_half, __CLC_VECSIZE), x) + +#ifndef __CLC_SCALAR + +#define CLC_VSTORE_DECL(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void CLC_VSTORE_NAME( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE __CLC_SCALAR_GENTYPE *p); + +CLC_VSTORE_DECL(__private) +CLC_VSTORE_DECL(__local) +CLC_VSTORE_DECL(__global) + +#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED +CLC_VSTORE_DECL(__generic) +#endif + +#undef CLC_VSTORE_DECL + +#endif // __CLC_SCALAR + +// vstore_half and vstorea_half are available even if cl_khr_fp16 is +// unavailable. +#ifdef __CLC_FPSIZE +#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64 + +#define CLC_VSTORE_HALF_DECL(ADDRSPACE, SUFFIX) \ + _CLC_OVERLOAD _CLC_DECL void CLC_VSTORE_HALF_NAME(SUFFIX)( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p); \ + \ + _CLC_OVERLOAD _CLC_DECL void CLC_VSTOREA_HALF_NAME(SUFFIX)( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p); + +#define CLC_VSTORE_HALF_DECL_ALL_MODES(ADDRSPACE) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, ) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtz) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtn) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtp) \ + CLC_VSTORE_HALF_DECL(ADDRSPACE, _rte) + +CLC_VSTORE_HALF_DECL_ALL_MODES(__private) +CLC_VSTORE_HALF_DECL_ALL_MODES(__local) +CLC_VSTORE_HALF_DECL_ALL_MODES(__global) + +#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED +CLC_VSTORE_HALF_DECL_ALL_MODES(__generic) +#endif + +#undef CLC_VSTORE_HALF_DECL +#undef CLC_VSTORE_HALF_DECL_ALL_MODES + +#endif +#endif + +#undef CLC_VSTORE_TY +#undef CLC_VSTORE_NAME +#undef CLC_VSTORE_HALF_NAME +#undef CLC_VSTOREA_HALF_NAME diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index a8a906159e28..49c7ca636f24 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -147,3 +147,5 @@ relational/clc_signbit.cl shared/clc_clamp.cl shared/clc_max.cl shared/clc_min.cl +shared/clc_vload.cl +shared/clc_vstore.cl diff --git a/libclc/clc/lib/generic/shared/clc_vload.cl b/libclc/clc/lib/generic/shared/clc_vload.cl new file mode 100644 index 000000000000..e4003e4a9673 --- /dev/null +++ b/libclc/clc/lib/generic/shared/clc_vload.cl @@ -0,0 +1,130 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/internal/clc.h>
+#include <clc/shared/clc_vload.h>
+
+#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                                \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 __clc_vload2(                           \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    return *(                                                                 \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##2 *)(&x[2 * offset]));    \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 __clc_vload3(                           \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    PRIM_TYPE##2 vec =                                                        \
+        *((const ADDR_SPACE less_aligned_##PRIM_TYPE##2 *)(&x[3 * offset]));  \
+    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]);                 \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 __clc_vload4(                           \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    return *(                                                                 \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##4 *)(&x[4 * offset]));    \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 __clc_vload8(                           \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    return *(                                                                 \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##8 *)(&x[8 * offset]));    \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 __clc_vload16(                         \
+      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
+    return *(                                                                 \
+        (const ADDR_SPACE less_aligned_##PRIM_TYPE##16 *)(&x[16 * offset]));  \
+  }
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+#define VLOAD_VECTORIZE_GENERIC VLOAD_VECTORIZE
+#else
+// The generic address space isn't available, so make the macro do nothing
+#define VLOAD_VECTORIZE_GENERIC(X, Y)
+#endif
+
+#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE)                               \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private)                            \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local)                              \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant)                           \
+  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global)                             \
+  VLOAD_VECTORIZE_GENERIC(__CLC_SCALAR_GENTYPE, __generic)
+
+#define VLOAD_TYPES()                                                         \
+  VLOAD_ADDR_SPACES(char)                                                     \
+  VLOAD_ADDR_SPACES(uchar)                                                    \
+  VLOAD_ADDR_SPACES(short)                                                    \
+  VLOAD_ADDR_SPACES(ushort)                                                   \
+  VLOAD_ADDR_SPACES(int)                                                      \
+  VLOAD_ADDR_SPACES(uint)                                                     \
+  VLOAD_ADDR_SPACES(long)                                                     \
+  VLOAD_ADDR_SPACES(ulong)                                                    \
+  VLOAD_ADDR_SPACES(float)
+
+VLOAD_TYPES()
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+VLOAD_ADDR_SPACES(double)
+#endif
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+VLOAD_ADDR_SPACES(half)
+#endif
+
+/* The vload_half functions are legal even without cl_khr_fp16. */
+/* There is no vload_half for double. */
+#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
+#define VEC_LOAD2(val, AS)                                                    \
+  VEC_LOAD1(val.lo, AS)                                                       \
+  VEC_LOAD1(val.hi, AS)
+#define VEC_LOAD3(val, AS)                                                    \
+  VEC_LOAD1(val.s0, AS)                                                       \
+  VEC_LOAD1(val.s1, AS)                                                       \
+  VEC_LOAD1(val.s2, AS)
+#define VEC_LOAD4(val, AS)                                                    \
+  VEC_LOAD2(val.lo, AS)                                                       \
+  VEC_LOAD2(val.hi, AS)
+#define VEC_LOAD8(val, AS)                                                    \
+  VEC_LOAD4(val.lo, AS)                                                       \
+  VEC_LOAD4(val.hi, AS)
+#define VEC_LOAD16(val, AS)                                                   \
+  VEC_LOAD8(val.lo, AS)                                                       \
+  VEC_LOAD8(val.hi, AS)
+
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                       \
+  _CLC_OVERLOAD _CLC_DEF TYPE __clc_vload_half##SUFFIX(size_t offset,         \
+                                                       const AS half *mem) {  \
+    offset *= VEC_SIZE;                                                       \
+    TYPE __tmp;                                                               \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
+  }                                                                           \
+  _CLC_OVERLOAD _CLC_DEF TYPE __clc_vloada_half##SUFFIX(size_t offset,        \
+                                                        const AS half *mem) { \
+    offset *= OFFSET_SIZE;                                                    \
+    TYPE __tmp;                                                               \
+    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
+  }
+
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                         \
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
+
+#define __CLC_BODY "clc_vload_half.inc"
+#include <clc/math/gentype.inc>
+#undef FUNC
+#undef __FUNC
+#undef VEC_LOAD16
+#undef VEC_LOAD8
+#undef VEC_LOAD4
+#undef VEC_LOAD3
+#undef VEC_LOAD2
+#undef VEC_LOAD1
+#undef VLOAD_TYPES
+#undef VLOAD_ADDR_SPACES
+#undef VLOAD_VECTORIZE
+#undef VLOAD_VECTORIZE_GENERIC
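
One subtlety in the macro above is __clc_vload3: it reads a two-element vector plus one scalar rather than casting to a three- or four-element vector type, so it never touches a fourth element the caller may not own. Expanded by hand for int in the global address space (an illustrative expansion, not additional patch content):

    _CLC_OVERLOAD _CLC_DEF int3 __clc_vload3(size_t offset,
                                             const __global int *x) {
      /* Elements 0 and 1 via one scalar-aligned 2-vector load... */
      int2 vec = *(const __global less_aligned_int2 *)(&x[3 * offset]);
      /* ...and element 2 on its own, so no byte past the third is read. */
      return (int3)(vec.s0, vec.s1, x[offset * 3 + 2]);
    }
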
diff --git a/libclc/opencl/lib/generic/shared/vload_half.inc b/libclc/clc/lib/generic/shared/clc_vload_half.inc
similarity index 100%
rename from libclc/opencl/lib/generic/shared/vload_half.inc
rename to libclc/clc/lib/generic/shared/clc_vload_half.inc
diff --git a/libclc/clc/lib/generic/shared/clc_vstore.cl b/libclc/clc/lib/generic/shared/clc_vstore.cl
new file mode 100644
index 000000000000..adde58aec915
--- /dev/null
+++ b/libclc/clc/lib/generic/shared/clc_vstore.cl
@@ -0,0 +1,268 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/internal/clc.h>
+#include <clc/math/clc_copysign.h>
+#include <clc/math/clc_fabs.h>
+#include <clc/math/clc_nextafter.h>
+#include <clc/relational/clc_isinf.h>
+#include <clc/relational/clc_isnan.h>
+#include <clc/shared/clc_min.h>
+#include <clc/shared/clc_vstore.h>
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                               \
+  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore2(PRIM_TYPE##2 vec, size_t offset,  \
+                                            ADDR_SPACE PRIM_TYPE *mem) {      \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
+           *)(&mem[2 * offset])) = vec;                                       \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore3(PRIM_TYPE##3 vec, size_t offset,  \
+                                            ADDR_SPACE PRIM_TYPE *mem) {      \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
+           *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1);            \
+    mem[3 * offset + 2] = vec.s2;                                             \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore4(PRIM_TYPE##4 vec, size_t offset,  \
+                                            ADDR_SPACE PRIM_TYPE *mem) {      \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4                     \
+           *)(&mem[4 * offset])) = vec;                                       \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore8(PRIM_TYPE##8 vec, size_t offset,  \
+                                            ADDR_SPACE PRIM_TYPE *mem) {      \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8                     \
+           *)(&mem[8 * offset])) = vec;                                       \
+  }                                                                           \
+                                                                              \
+  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16              \
+      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore16(PRIM_TYPE##16 vec, size_t offset, \
+                                             ADDR_SPACE PRIM_TYPE *mem) {     \
+    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16                    \
+           *)(&mem[16 * offset])) = vec;                                      \
+  }
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+#define VSTORE_VECTORIZE_GENERIC VSTORE_VECTORIZE
+#else
+// The generic address space isn't available, so make the macro do nothing
+#define VSTORE_VECTORIZE_GENERIC(X, Y)
+#endif
+
+#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE)                        \
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private)                     \
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local)                       \
+  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)                      \
+  VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic)
+
+VSTORE_ADDR_SPACES(char)
+VSTORE_ADDR_SPACES(uchar)
+VSTORE_ADDR_SPACES(short)
+VSTORE_ADDR_SPACES(ushort)
+VSTORE_ADDR_SPACES(int)
+VSTORE_ADDR_SPACES(uint)
+VSTORE_ADDR_SPACES(long)
+VSTORE_ADDR_SPACES(ulong)
+VSTORE_ADDR_SPACES(float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+VSTORE_ADDR_SPACES(double)
+#endif
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+VSTORE_ADDR_SPACES(half)
+#endif
+
+#define VEC_STORE1(val, ROUNDF, BUILTIN) BUILTIN(ROUNDF(val), &mem[offset++]);
+
+#define VEC_STORE2(val, ROUNDF, BUILTIN)                                      \
+  VEC_STORE1(val.lo, ROUNDF, BUILTIN)                                         \
+  VEC_STORE1(val.hi, ROUNDF, BUILTIN)
+#define VEC_STORE3(val, ROUNDF, BUILTIN)                                      \
+  VEC_STORE1(val.s0, ROUNDF, BUILTIN)                                         \
+  VEC_STORE1(val.s1, ROUNDF, BUILTIN)                                         \
+  VEC_STORE1(val.s2, ROUNDF, BUILTIN)
+#define VEC_STORE4(val, ROUNDF, BUILTIN)                                      \
+  VEC_STORE2(val.lo, ROUNDF, BUILTIN)                                         \
+  VEC_STORE2(val.hi, ROUNDF, BUILTIN)
+#define VEC_STORE8(val, ROUNDF, BUILTIN)                                      \
+  VEC_STORE4(val.lo, ROUNDF, BUILTIN)                                         \
+  VEC_STORE4(val.hi, ROUNDF, BUILTIN)
+#define VEC_STORE16(val, ROUNDF, BUILTIN)                                     \
+  VEC_STORE8(val.lo, ROUNDF, BUILTIN)                                         \
+  VEC_STORE8(val.hi, ROUNDF, BUILTIN)
+
+#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, ROUNDF, BUILTIN)           \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstore_half##SUFFIX(                      \
+      TYPE vec, size_t offset, AS half *mem) {                                \
+    offset *= VEC_SIZE;                                                       \
+    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN)                                 \
+  }                                                                           \
+  _CLC_OVERLOAD _CLC_DEF void __clc_vstorea_half##SUFFIX(                     \
+      TYPE vec, size_t offset, AS half *mem) {                                \
+    offset *= OFFSET;                                                         \
+    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN)                                 \
+  }
+
+_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
+_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
+  /* Remove lower 13 bits to make sure the number is rounded down */
+  int mask = 0xffffe000;
+  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use a different bit for rounding */
+  if (exp < -14)
+    mask <<= __clc_min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (__clc_fabs(x) > 65504.0f && !__clc_isinf(x))
+    return __clc_copysign(65504.0f, x);
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  return __clc_as_float(__clc_as_uint(x) & mask);
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
+  const float inf = __clc_copysign(INFINITY, x);
+  /* Set lower 13 bits */
+  int mask = (1 << 13) - 1;
+  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
+  /* Denormals cannot be flushed, and they use a different bit for rounding */
+  if (exp < -14)
+    mask = (1 << (13 + __clc_min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  const float next =
+      __clc_nextafter(__clc_as_float(__clc_as_uint(x) | mask), inf);
+  return ((__clc_as_uint(x) & mask) == 0) ? x : next;
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
+  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
+  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
+}
+_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
+  /* Mantissa + implicit bit */
+  const uint mantissa = (__clc_as_uint(x) & 0x7fffff) | (1u << 23);
+  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
+  int shift = 13;
+  if (exp < -14) {
+    /* The default assumes lower 13 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += __clc_min(-(exp + 14), 15);
+  }
+  int mask = (1 << shift) - 1;
+  const uint grs = mantissa & mask;
+  const uint last = mantissa & (1 << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1 << (shift - 1))) ||
+                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
+}
+
+#ifdef cl_khr_fp64
+_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
+_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
+  /* Remove lower 42 bits to make sure the number is rounded down */
+  ulong mask = 0xfffffc0000000000UL;
+  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use a different bit for rounding */
+  if (exp < -14)
+    mask <<= __clc_min(-(exp + 14), 10);
+  /* RTZ does not produce Inf for large numbers */
+  if (__clc_fabs(x) > 65504.0 && !__clc_isinf(x))
+    return __clc_copysign(65504.0, x);
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  return __clc_as_double(__clc_as_ulong(x) & mask);
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
+  const double inf = __clc_copysign((double)INFINITY, x);
+  /* Set lower 42 bits */
+  long mask = (1UL << 42UL) - 1UL;
+  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
+  /* Denormals cannot be flushed, and they use a different bit for rounding */
+  if (exp < -14)
+    mask = (1UL << (42UL + __clc_min(-(exp + 14), 10))) - 1;
+  /* Handle nan corner case */
+  if (__clc_isnan(x))
+    return x;
+  const double next =
+      __clc_nextafter(__clc_as_double(__clc_as_ulong(x) | mask), inf);
+  return ((__clc_as_ulong(x) & mask) == 0) ? x : next;
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
+  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
+                                                           : __clc_rti(x);
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
+  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
+                                                           : __clc_rtz(x);
+}
+_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
+  /* Mantissa + implicit bit */
+  const ulong mantissa = (__clc_as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
+  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
+  int shift = 42;
+  if (exp < -14) {
+    /* The default assumes lower 42 bits are rounded,
+     * but it might be more for denormals.
+     * Shifting beyond last == 0b, and qr == 00b is not necessary */
+    shift += __clc_min(-(exp + 14), 15);
+  }
+  ulong mask = (1UL << shift) - 1UL;
+  const ulong grs = mantissa & mask;
+  const ulong last = mantissa & (1UL << shift);
+  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
+   * exp > 15 should round to inf. */
+  bool roundup = (grs > (1UL << (shift - 1UL))) ||
+                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
+  return roundup ? __clc_rti(x) : __clc_rtz(x);
+}
+#endif
+
+#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)                  \
+  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, __clc_noop, BUILTIN)             \
+  __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtz, BUILTIN)        \
+  __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtn, BUILTIN)        \
+  __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtp, BUILTIN)        \
+  __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, AS, __clc_rte, BUILTIN)
+
+#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)                     \
+  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)
+
+#define __CLC_BODY "clc_vstore_half.inc"
+#include <clc/math/gentype.inc>
+#undef FUNC
+#undef __XFUNC
+#undef __FUNC
+#undef VEC_STORE16
+#undef VEC_STORE8
+#undef VEC_STORE4
+#undef VEC_STORE3
+#undef VEC_STORE2
+#undef VEC_STORE1
+#undef DECLARE_HELPER
+#undef VSTORE_ADDR_SPACES
+#undef VSTORE_VECTORIZE
+#undef VSTORE_VECTORIZE_GENERIC
diff --git a/libclc/opencl/lib/generic/shared/vstore_half.inc b/libclc/clc/lib/generic/shared/clc_vstore_half.inc
similarity index 100%
rename from libclc/opencl/lib/generic/shared/vstore_half.inc
rename to libclc/clc/lib/generic/shared/clc_vstore_half.inc
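
The rounding helpers in clc_vstore.cl above work directly on the float bit pattern: half carries 13 fewer mantissa bits than float, so clearing or bumping those low 13 bits yields a float that then converts to half exactly, with the rounding decision already made. A small worked sketch using the helpers defined in that file (the function name is hypothetical; results assume IEEE half with 10 mantissa bits):

    void example_rounding(void) {
      float x = 0x1.001p+0f;     /* 1 + 2^-12: not representable in half */
      float down = __clc_rtz(x); /* low 13 mantissa bits cleared -> 1.0f */
      float up = __clc_rti(x);   /* next half away from zero -> 1 + 2^-10 */
    }
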
diff --git a/libclc/opencl/lib/generic/shared/vload.cl b/libclc/opencl/lib/generic/shared/vload.cl
index 4bfb5a012ce1..ad2283958013 100644
--- a/libclc/opencl/lib/generic/shared/vload.cl
+++ b/libclc/opencl/lib/generic/shared/vload.cl
@@ -7,134 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include <clc/clc.h>
+#include <clc/shared/clc_vload.h>
 
-#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                                \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset,                   \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2        \
-                  *)(&x[2 * offset]));                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset,                   \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    PRIM_TYPE##2 vec =                                                        \
-        *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2           \
-               *)(&x[3 * offset]));                                           \
-    return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]);                 \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset,                   \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4        \
-                  *)(&x[4 * offset]));                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset,                   \
-                                             const ADDR_SPACE PRIM_TYPE *x) { \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8        \
-                  *)(&x[8 * offset]));                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16              \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(                               \
-      size_t offset, const ADDR_SPACE PRIM_TYPE *x) {                         \
-    return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16       \
-                  *)(&x[16 * offset]));                                       \
-  }
+#define __CLC_BODY "vload.inc"
+#include <clc/integer/gentype.inc>
 
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define VLOAD_VECTORIZE_GENERIC VLOAD_VECTORIZE
-#else
-// The generic address space isn't available, so make the macro do nothing
-#define VLOAD_VECTORIZE_GENERIC(X, Y)
-#endif
-
-#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE)                               \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private)                            \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local)                              \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant)                           \
-  VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global)                             \
-  VLOAD_VECTORIZE_GENERIC(__CLC_SCALAR_GENTYPE, __generic)
-
-#define VLOAD_TYPES()                                                         \
-  VLOAD_ADDR_SPACES(char)                                                     \
-  VLOAD_ADDR_SPACES(uchar)                                                    \
-  VLOAD_ADDR_SPACES(short)                                                    \
-  VLOAD_ADDR_SPACES(ushort)                                                   \
-  VLOAD_ADDR_SPACES(int)                                                      \
-  VLOAD_ADDR_SPACES(uint)                                                     \
-  VLOAD_ADDR_SPACES(long)                                                     \
-  VLOAD_ADDR_SPACES(ulong)                                                    \
-  VLOAD_ADDR_SPACES(float)
-
-VLOAD_TYPES()
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VLOAD_ADDR_SPACES(double)
-#endif
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VLOAD_ADDR_SPACES(half)
-#endif
-
-/* vload_half are legal even without cl_khr_fp16 */
-/* no vload_half for double */
-#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
-#define VEC_LOAD2(val, AS)                                                    \
-  VEC_LOAD1(val.lo, AS)                                                       \
-  VEC_LOAD1(val.hi, AS)
-#define VEC_LOAD3(val, AS)                                                    \
-  VEC_LOAD1(val.s0, AS)                                                       \
-  VEC_LOAD1(val.s1, AS)                                                       \
-  VEC_LOAD1(val.s2, AS)
-#define VEC_LOAD4(val, AS)                                                    \
-  VEC_LOAD2(val.lo, AS)                                                       \
-  VEC_LOAD2(val.hi, AS)
-#define VEC_LOAD8(val, AS)                                                    \
-  VEC_LOAD4(val.lo, AS)                                                       \
-  VEC_LOAD4(val.hi, AS)
-#define VEC_LOAD16(val, AS)                                                   \
-  VEC_LOAD8(val.lo, AS)                                                       \
-  VEC_LOAD8(val.hi, AS)
-
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                       \
-  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset,               \
-                                                 const AS half *mem) {        \
-    offset *= VEC_SIZE;                                                       \
-    TYPE __tmp;                                                               \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
-  }                                                                           \
-  _CLC_OVERLOAD _CLC_DEF TYPE vloada_half##SUFFIX(size_t offset,              \
-                                                  const AS half *mem) {       \
-    offset *= OFFSET_SIZE;                                                    \
-    TYPE __tmp;                                                               \
-    VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp;                               \
-  }
-
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)                         \
-  __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS)
-
-#define __CLC_BODY "vload_half.inc"
+#define __CLC_BODY "vload.inc"
 #include <clc/math/gentype.inc>
-#undef FUNC
-#undef __FUNC
-#undef VEC_LOAD16
-#undef VEC_LOAD8
-#undef VEC_LOAD4
-#undef VEC_LOAD3
-#undef VEC_LOAD2
-#undef VEC_LOAD1
-#undef VLOAD_TYPES
-#undef VLOAD_ADDR_SPACES
-#undef VLOAD_VECTORIZE
-#undef VLOAD_VECTORIZE_GENERIC
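
After this hunk, the OpenCL-layer vload.cl is a thin forwarding shim. Reconstructed in full from the hunk above (with the include arguments restored as in this rewrite, so treat the exact paths as an assumption):

    #include <clc/clc.h>
    #include <clc/shared/clc_vload.h>

    #define __CLC_BODY "vload.inc"
    #include <clc/integer/gentype.inc>

    #define __CLC_BODY "vload.inc"
    #include <clc/math/gentype.inc>
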
diff --git a/libclc/opencl/lib/generic/shared/vload.inc b/libclc/opencl/lib/generic/shared/vload.inc
new file mode 100644
index 000000000000..62cb040aad18
--- /dev/null
+++ b/libclc/opencl/lib/generic/shared/vload.inc
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VLOAD_NAME(x) __CLC_XCONCAT(__CLC_XCONCAT(x, vload), __CLC_VECSIZE)
+#define CLC_VLOAD_HALF_NAME(x)                                                \
+  __CLC_XCONCAT(__CLC_XCONCAT(x, vload_half), __CLC_VECSIZE)
+#define CLC_VLOADA_HALF_NAME(x)                                               \
+  __CLC_XCONCAT(__CLC_XCONCAT(x, vloada_half), __CLC_VECSIZE)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+
+#define VLOAD_DEF(ADDRSPACE)                                                  \
+  _CLC_OVERLOAD _CLC_DEF CLC_VLOAD_TY CLC_VLOAD_NAME()(                       \
+      size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x) {               \
+    return CLC_VLOAD_NAME(__clc_)(offset, x);                                 \
+  }
+
+VLOAD_DEF(__private)
+VLOAD_DEF(__local)
+VLOAD_DEF(__constant)
+VLOAD_DEF(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+VLOAD_DEF(__generic)
+#endif
+
+#undef VLOAD_DEF
+#undef CLC_VLOAD_TY
+
+#endif
+
+// vload_half and vloada_half are available even if cl_khr_fp16 is unavailable.
+// Declare these functions when working on float types, which we know are
+// always available.
+#ifdef __CLC_FPSIZE
+#if __CLC_FPSIZE == 32
+
+#define VLOAD_HALF_DEF(ADDRSPACE)                                             \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE CLC_VLOAD_HALF_NAME()(                 \
+      size_t offset, const ADDRSPACE half *mem) {                             \
+    return CLC_VLOAD_HALF_NAME(__clc_)(offset, mem);                          \
+  }                                                                           \
+                                                                              \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE CLC_VLOADA_HALF_NAME()(                \
+      size_t offset, const ADDRSPACE half *mem) {                             \
+    return CLC_VLOADA_HALF_NAME(__clc_)(offset, mem);                         \
+  }
+
+VLOAD_HALF_DEF(__private)
+VLOAD_HALF_DEF(__local)
+VLOAD_HALF_DEF(__constant)
+VLOAD_HALF_DEF(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+VLOAD_HALF_DEF(__generic)
+#endif
+
+#undef VLOAD_HALF_DEF
+#endif
+#endif
+
+#undef CLC_VLOAD_NAME
+#undef CLC_VLOAD_HALF_NAME
+#undef CLC_VLOADA_HALF_NAME
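
Each OpenCL entry point in vload.inc is a one-line forward to its __clc_-prefixed CLC counterpart: CLC_VLOAD_NAME() with an empty argument yields the public name, and CLC_VLOAD_NAME(__clc_) the internal one. For instance, VLOAD_DEF(__global) instantiated at float4 expands to roughly (an illustrative expansion):

    _CLC_OVERLOAD _CLC_DEF less_aligned_float4 vload4(size_t offset,
                                                      const __global float *x) {
      /* Public OpenCL builtin forwards to the CLC implementation. */
      return __clc_vload4(offset, x);
    }
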
diff --git a/libclc/opencl/lib/generic/shared/vstore.cl b/libclc/opencl/lib/generic/shared/vstore.cl
index fe4890defe84..145658f873dc 100644
--- a/libclc/opencl/lib/generic/shared/vstore.cl
+++ b/libclc/opencl/lib/generic/shared/vstore.cl
@@ -7,253 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include <clc/clc.h>
+#include <clc/shared/clc_vstore.h>
 
-#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#define __CLC_BODY "vstore.inc"
+#include <clc/integer/gentype.inc>
 
-#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE)                               \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset,        \
-                                      ADDR_SPACE PRIM_TYPE *mem) {            \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
-          *)(&mem[2 * offset])) = vec;                                        \
-  }                                                                           \
-                                                                              \
-  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset,        \
-                                      ADDR_SPACE PRIM_TYPE *mem) {            \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2                     \
-          *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1);             \
-    mem[3 * offset + 2] = vec.s2;                                             \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset,        \
-                                      ADDR_SPACE PRIM_TYPE *mem) {            \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4                     \
-          *)(&mem[4 * offset])) = vec;                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8                \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset,        \
-                                      ADDR_SPACE PRIM_TYPE *mem) {            \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8                     \
-          *)(&mem[8 * offset])) = vec;                                        \
-  }                                                                           \
-                                                                              \
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16              \
-      __attribute__((aligned(sizeof(PRIM_TYPE))));                            \
-  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset,      \
-                                       ADDR_SPACE PRIM_TYPE *mem) {           \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16                    \
-          *)(&mem[16 * offset])) = vec;                                       \
-  }
-
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define VSTORE_VECTORIZE_GENERIC VSTORE_VECTORIZE
-#else
-// The generic address space isn't available, so make the macro do nothing
-#define VSTORE_VECTORIZE_GENERIC(X, Y)
-#endif
-
-#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE)                        \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private)                     \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local)                       \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)                      \
-  VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic)
-
-VSTORE_ADDR_SPACES(char)
-VSTORE_ADDR_SPACES(uchar)
-VSTORE_ADDR_SPACES(short)
-VSTORE_ADDR_SPACES(ushort)
-VSTORE_ADDR_SPACES(int)
-VSTORE_ADDR_SPACES(uint)
-VSTORE_ADDR_SPACES(long)
-VSTORE_ADDR_SPACES(ulong)
-VSTORE_ADDR_SPACES(float)
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VSTORE_ADDR_SPACES(double)
-#endif
-
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VSTORE_ADDR_SPACES(half)
-#endif
-
-#define VEC_STORE1(val, ROUNDF, BUILTIN) BUILTIN(ROUNDF(val), &mem[offset++]);
-
-#define VEC_STORE2(val, ROUNDF, BUILTIN)                                      \
-  VEC_STORE1(val.lo, ROUNDF, BUILTIN)                                         \
-  VEC_STORE1(val.hi, ROUNDF, BUILTIN)
-#define VEC_STORE3(val, ROUNDF, BUILTIN)                                      \
-  VEC_STORE1(val.s0, ROUNDF, BUILTIN)                                         \
-  VEC_STORE1(val.s1, ROUNDF, BUILTIN)                                         \
-  VEC_STORE1(val.s2, ROUNDF, BUILTIN)
-#define VEC_STORE4(val, ROUNDF, BUILTIN)                                      \
-  VEC_STORE2(val.lo, ROUNDF, BUILTIN)                                         \
-  VEC_STORE2(val.hi, ROUNDF, BUILTIN)
-#define VEC_STORE8(val, ROUNDF, BUILTIN)                                      \
-  VEC_STORE4(val.lo, ROUNDF, BUILTIN)                                         \
-  VEC_STORE4(val.hi, ROUNDF, BUILTIN)
-#define VEC_STORE16(val, ROUNDF, BUILTIN)                                     \
-  VEC_STORE8(val.lo, ROUNDF, BUILTIN)                                         \
-  VEC_STORE8(val.hi, ROUNDF, BUILTIN)
-
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, ROUNDF, BUILTIN)           \
-  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset,    \
-                                                  AS half *mem) {             \
-    offset *= VEC_SIZE;                                                       \
-    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN)                                 \
-  }                                                                           \
-  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset,   \
-                                                   AS half *mem) {            \
-    offset *= OFFSET;                                                         \
-    VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN)                                 \
-  }
-
-_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
-  /* Remove lower 13 bits to make sure the number is rounded down */
-  int mask = 0xffffe000;
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (fabs(x) > 65504.0f && !isinf(x))
-    return copysign(65504.0f, x);
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  return as_float(as_uint(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
-  const float inf = copysign(INFINITY, x);
-  /* Set lower 13 bits */
-  int mask = (1 << 13) - 1;
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  const float next = nextafter(as_float(as_uint(x) | mask), inf);
-  return ((as_uint(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
-  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
-  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
-  /* Mantisa + implicit bit */
-  const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
-  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
-  int shift = 13;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += min(-(exp + 14), 15);
-  }
-  int mask = (1 << shift) - 1;
-  const uint grs = mantissa & mask;
-  const uint last = mantissa & (1 << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1 << (shift - 1))) ||
-                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-
-#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
-  /* Remove lower 42 bits to make sure the number is rounded down */
-  ulong mask = 0xfffffc0000000000UL;
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (fabs(x) > 65504.0 && !isinf(x))
-    return copysign(65504.0, x);
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  return as_double(as_ulong(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
-  const double inf = copysign((double)INFINITY, x);
-  /* Set lower 42 bits */
-  long mask = (1UL << 42UL) - 1UL;
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (isnan(x))
-    return x;
-  const double next = nextafter(as_double(as_ulong(x) | mask), inf);
-  return ((as_ulong(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
-  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
-                                                     : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
-  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
-                                                     : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
-  /* Mantisa + implicit bit */
-  const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
-  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
-  int shift = 42;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += min(-(exp + 14), 15);
-  }
-  ulong mask = (1UL << shift) - 1UL;
-  const ulong grs = mantissa & mask;
-  const ulong last = mantissa & (1UL << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1UL << (shift - 1UL))) ||
-                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-#endif
-
-#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)                  \
-  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, __clc_noop, BUILTIN)             \
-  __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtz, BUILTIN)        \
-  __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtn, BUILTIN)        \
-  __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtp, BUILTIN)        \
-  __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, AS, __clc_rte, BUILTIN)
-
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)                     \
-  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN)
-
-#define __CLC_BODY "vstore_half.inc"
+#define __CLC_BODY "vstore.inc"
 #include <clc/math/gentype.inc>
-#undef FUNC
-#undef __XFUNC
-#undef __FUNC
-#undef VEC_LOAD16
-#undef VEC_LOAD8
-#undef VEC_LOAD4
-#undef VEC_LOAD3
-#undef VEC_LOAD2
-#undef VEC_LOAD1
-#undef DECLARE_HELPER
-#undef VSTORE_ADDR_SPACES
-#undef VSTORE_VECTORIZE
-#undef VSTORE_VECTORIZE_GENERIC
diff --git a/libclc/opencl/lib/generic/shared/vstore.inc b/libclc/opencl/lib/generic/shared/vstore.inc
new file mode 100644
index 000000000000..4bdce0719912
--- /dev/null
+++ b/libclc/opencl/lib/generic/shared/vstore.inc
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)
+#define CLC_VSTORE_NAME(x)                                                    \
+  __CLC_XCONCAT(__CLC_XCONCAT(x, vstore), __CLC_VECSIZE)
+#define CLC_VSTORE_HALF_NAME(x, y)                                            \
+  __CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(x, vstore_half), __CLC_VECSIZE), y)
+#define CLC_VSTOREA_HALF_NAME(x, y)                                           \
+  __CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(x, vstorea_half), __CLC_VECSIZE), y)
+
+#ifndef __CLC_SCALAR
+
+#define CLC_VSTORE_DEF(ADDRSPACE)                                             \
+  _CLC_OVERLOAD _CLC_DEF void CLC_VSTORE_NAME()(                              \
+      CLC_VSTORE_TY data, size_t offset, ADDRSPACE __CLC_SCALAR_GENTYPE *p) { \
+    return CLC_VSTORE_NAME(__clc_)(data, offset, p);                          \
+  }
+
+CLC_VSTORE_DEF(__private)
+CLC_VSTORE_DEF(__local)
+CLC_VSTORE_DEF(__global)
+
+#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
+CLC_VSTORE_DEF(__generic)
+#endif
+
+#undef CLC_VSTORE_DEF
+
+#endif // __CLC_SCALAR
+
+// vstore_half and vstorea_half are available even if cl_khr_fp16 is
+// unavailable.
+#ifdef __CLC_FPSIZE +#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64 + +#define CLC_VSTORE_HALF_DEF(ADDRSPACE, SUFFIX) \ + _CLC_OVERLOAD _CLC_DEF void CLC_VSTORE_HALF_NAME(, SUFFIX)( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \ + CLC_VSTORE_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \ + } \ + \ + _CLC_OVERLOAD _CLC_DEF void CLC_VSTOREA_HALF_NAME(, SUFFIX)( \ + CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \ + CLC_VSTOREA_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \ + } + +#define CLC_VSTORE_HALF_DEF_ALL_MODES(ADDRSPACE) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, ) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtz) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtn) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtp) \ + CLC_VSTORE_HALF_DEF(ADDRSPACE, _rte) + +CLC_VSTORE_HALF_DEF_ALL_MODES(__private) +CLC_VSTORE_HALF_DEF_ALL_MODES(__local) +CLC_VSTORE_HALF_DEF_ALL_MODES(__global) + +#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED +CLC_VSTORE_HALF_DEF_ALL_MODES(__generic) +#endif + +#undef CLC_VSTORE_HALF_DEF +#undef CLC_VSTORE_HALF_DEF_ALL_MODES + +#endif +#endif + +#undef CLC_VSTORE_TY +#undef CLC_VSTORE_NAME +#undef CLC_VSTORE_HALF_NAME +#undef CLC_VSTOREA_HALF_NAME
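
End to end, user code is unaffected by the move: a public call still resolves to the same rounding and store sequence, now routed through the __clc_-prefixed CLC layer. A minimal usage sketch (the kernel name is hypothetical; vstore_half4_rtz is the standard OpenCL builtin):

    __kernel void pack_to_half(__global const float4 *in, __global half *out) {
      size_t i = get_global_id(0);
      /* Forwards to __clc_vstore_half4_rtz, which applies __clc_rtz per
         lane before the exact float-to-half conversion and store. */
      vstore_half4_rtz(in[i], i, out);
    }
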