From 7de73c4e9d5ee1ec00bb57427ac04746ce858c3c Mon Sep 17 00:00:00 2001 From: Abhinav Gaba Date: Mon, 29 Sep 2025 11:47:21 -0700 Subject: [PATCH] [OpenMP][Offload] Support `PRIVATE | ATTACH` maps for corresponding-pointer-initialization. (#160760) `PRIVATE | ATTACH` maps can be used to represent firstprivate pointers that should be initialized by doing doing the pointee's device address, if its lookup succeeds, or retain the original host pointee's address otherwise. With this, for a test like the following: ```f90 integer, pointer :: p(:) !$omp target map(p(1)) ... print*, p(1) !$omp end target ``` The codegen can look like: ```llvm ; maps for p: ; &p(1), &p(1), sizeof(p(1)), TO|FROM //(1) ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), ATTACH //(2) ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), PRIVATE|ATTACH|PARAM //(3) call... @__omp_outlined...(ptr %ref_ptr_of_p) ``` * `(1)` maps the pointee `p(1)`. * `(2)` attaches it to the (previously) mapped `ref_ptr(p)`, if present. It can be controlled via OpenMP 6.1's `attach(auto/always/never)` map-type modifiers. * `(3)` privatizes and initializes the local `ref_ptr(p)`, which gets passed in as the kernel argument `%ref_ptr_of_p`. Can be skipped if p is not referenced directly within the region. While similar mapping can be used for C/C++, it's more important/useful for Fortran as we can avoid creating another argument for passing the descriptor, and use that to initialize the private copy in the body of the kernel. --- offload/libomptarget/omptarget.cpp | 369 ++++++++++++++++++++++++----- 1 file changed, 306 insertions(+), 63 deletions(-) diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index 39286d41ec86..a1950cbb6290 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -330,6 +330,54 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, return Rc; } +/// Returns a buffer of the requested \p Size, to be used as the source for +/// `submitData`. +/// +/// For small buffers (`Size <= sizeof(void*)`), uses \p AsyncInfo's +/// getVoidPtrLocation(). +/// For larger buffers, creates a dynamic buffer which will be eventually +/// deleted by \p AsyncInfo's post-processing callback. +static char *getOrCreateSourceBufferForSubmitData(AsyncInfoTy &AsyncInfo, + int64_t Size) { + constexpr int64_t VoidPtrSize = sizeof(void *); + + if (Size <= VoidPtrSize) { + void *&BufferElement = AsyncInfo.getVoidPtrLocation(); + return reinterpret_cast(&BufferElement); + } + + // Create a dynamic buffer for larger data and schedule its deletion. + char *DataBuffer = new char[Size]; + AsyncInfo.addPostProcessingFunction([DataBuffer]() { + delete[] DataBuffer; + return OFFLOAD_SUCCESS; + }); + return DataBuffer; +} + +/// Calculates the target pointee base by applying the host +/// pointee begin/base delta to the target pointee begin. +/// +/// ``` +/// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase) +/// ``` +static void *calculateTargetPointeeBase(void *HstPteeBase, void *HstPteeBegin, + void *TgtPteeBegin) { + uint64_t Delta = reinterpret_cast(HstPteeBegin) - + reinterpret_cast(HstPteeBase); + void *TgtPteeBase = reinterpret_cast( + reinterpret_cast(TgtPteeBegin) - Delta); + + DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD + ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n", + DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta); + DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD + "\n", + DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin)); + + return TgtPteeBase; +} + /// Utility function to perform a pointer attachment operation. /// /// For something like: @@ -399,16 +447,8 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo, constexpr int64_t VoidPtrSize = sizeof(void *); assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small"); - uint64_t Delta = reinterpret_cast(HstPteeBegin) - - reinterpret_cast(HstPteeBase); - void *TgtPteeBase = reinterpret_cast( - reinterpret_cast(TgtPteeBegin) - Delta); - DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD - ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n", - DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta); - DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD - "\n", - DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin)); + void *TgtPteeBase = + calculateTargetPointeeBase(HstPteeBase, HstPteeBegin, TgtPteeBegin); // Add shadow pointer tracking if (!PtrTPR.getEntry()->addShadowPointer( @@ -435,48 +475,32 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo, return OFFLOAD_SUCCESS; }; - bool IsPtrAFortranDescriptor = HstPtrSize > VoidPtrSize; - if (!IsPtrAFortranDescriptor) { - // For "regular" pointers, we can use the VoidPtrLocation from AsyncInfo as - // the buffer space for the submission. - void *&BufferElement = AsyncInfo.getVoidPtrLocation(); - BufferElement = TgtPteeBase; + // Get a buffer to be used as the source for data submission. + char *SrcBuffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize); - // Submit the updated pointer value to device - return HandleSubmitResult(Device.submitData( - TgtPtrAddr, &BufferElement, VoidPtrSize, AsyncInfo, PtrTPR.getEntry())); + // The pointee's address should occupy the first VoidPtrSize bytes + // irrespective of HstPtrSize. + std::memcpy(SrcBuffer, &TgtPteeBase, VoidPtrSize); + + // For larger "pointers" (e.g., Fortran descriptors), copy remaining + // descriptor fields from the host descriptor into the buffer. + if (HstPtrSize > VoidPtrSize) { + uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; + void *HstDescriptorFieldsAddr = + reinterpret_cast(HstPtrAddr) + VoidPtrSize; + std::memcpy(SrcBuffer + VoidPtrSize, HstDescriptorFieldsAddr, + HstDescriptorFieldsSize); + + DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD + ") (pointer + %" PRId64 " additional bytes from host descriptor " DPxMOD + ")\n", + HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize, + DPxPTR(HstDescriptorFieldsAddr)); } - // For larger "pointers" (like Fortran's descriptors), we create a dynamic - // buffer, which will be eventually destroyed by AsyncInfo's post-processing - // callback. - char *DataBuffer = new char[HstPtrSize]; - - // For such descriptors, to the first VoidPtrSize bytes, we store the - // pointee's device address. - std::memcpy(DataBuffer, &TgtPteeBase, sizeof(void *)); - - // And to the remaining bytes, we copy the remaining contents of the host - // descriptor after the initial VoidPtrSize bytes. - uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; - void *HstDescriptorFieldsAddr = - reinterpret_cast(HstPtrAddr) + VoidPtrSize; - std::memcpy(DataBuffer + VoidPtrSize, HstDescriptorFieldsAddr, - HstDescriptorFieldsSize); - - DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD ") (pointer + %" PRId64 - " additional bytes from host descriptor " DPxMOD ")\n", - HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize, - DPxPTR(HstDescriptorFieldsAddr)); - - // Submit the entire buffer to device - int SubmitResult = Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize, + // Submit the populated source buffer to device. + int SubmitResult = Device.submitData(TgtPtrAddr, SrcBuffer, HstPtrSize, AsyncInfo, PtrTPR.getEntry()); - - AsyncInfo.addPostProcessingFunction([DataBuffer]() -> int { - delete[] DataBuffer; - return OFFLOAD_SUCCESS; - }); return HandleSubmitResult(SubmitResult); } @@ -525,10 +549,17 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, // ATTACH map-types are supposed to be handled after all mapping for the // construct is done. Defer their processing. if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) { - AttachInfo->AttachEntries.emplace_back( - /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin, - /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I], - /*PointeeName=*/HstPtrName); + const bool IsCorrespondingPointerInit = + (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE); + // We don't need to keep track of PRIVATE | ATTACH entries. They + // represent corresponding-pointer-initialization, and are handled + // similar to firstprivate (PRIVATE | TO) entries by + // PrivateArgumentManager. + if (!IsCorrespondingPointerInit) + AttachInfo->AttachEntries.emplace_back( + /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin, + /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I], + /*PointeeName=*/HstPtrName); DP("Deferring ATTACH map-type processing for argument %d\n", I); continue; @@ -1397,13 +1428,24 @@ class PrivateArgumentManagerTy { uint32_t Padding; /// Host pointer name map_var_info_t HstPtrName = nullptr; + /// For corresponding-pointer-initialization: host pointee base address. + void *HstPteeBase = nullptr; + /// For corresponding-pointer-initialization: host pointee begin address. + void *HstPteeBegin = nullptr; + /// Whether this argument needs corresponding-pointer-initialization. + bool IsCorrespondingPointerInit = false; FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size, uint32_t Alignment, uint32_t Padding, - map_var_info_t HstPtrName = nullptr) + map_var_info_t HstPtrName = nullptr, + void *HstPteeBase = nullptr, + void *HstPteeBegin = nullptr, + bool IsCorrespondingPointerInit = false) : HstPtrBegin(reinterpret_cast(HstPtr)), HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment), - Size(Size), Padding(Padding), HstPtrName(HstPtrName) {} + Size(Size), Padding(Padding), HstPtrName(HstPtrName), + HstPteeBase(HstPteeBase), HstPteeBegin(HstPteeBegin), + IsCorrespondingPointerInit(IsCorrespondingPointerInit) {} }; /// A vector of target pointers for all private arguments @@ -1421,6 +1463,153 @@ class PrivateArgumentManagerTy { /// A pointer to a \p AsyncInfoTy object AsyncInfoTy &AsyncInfo; + /// \returns the value of the target pointee's base to be used for + /// corresponding-pointer-initialization. + void *getTargetPointeeBaseForCorrespondingPointerInitialization( + void *HstPteeBase, void *HstPteeBegin) { + // See if the pointee's begin address has corresponding storage on device. + void *TgtPteeBegin = [&]() -> void * { + if (!HstPteeBegin) { + DP("Corresponding-pointer-initialization: pointee begin address is " + "null\n"); + return nullptr; + } + + return Device.getMappingInfo() + .getTgtPtrBegin(HstPteeBegin, /*Size=*/0, /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false) + .TargetPointer; + }(); + + // If it does, we calculate target pointee base using it, and return it. + // Otherwise, we retain the host pointee's base as the target pointee base + // of the initialized pointer. It's the user's responsibility to ensure + // that if a lookup fails, the host pointee is accessible on the device. + return TgtPteeBegin ? calculateTargetPointeeBase(HstPteeBase, HstPteeBegin, + TgtPteeBegin) + : HstPteeBase; + } + + /// Initialize the source buffer for corresponding-pointer-initialization. + /// + /// It computes and stores the target pointee base address (or the host + /// pointee's base address, if lookup of target pointee fails) to the first + /// `sizeof(void*)` bytes of \p Buffer, and for larger pointers + /// (Fortran descriptors), the remaining fields of the host descriptor + /// \p HstPtr after those `sizeof(void*)` bytes. + /// + /// Corresponding-pointer-initialization represents the initialization of the + /// private version of a base-pointer/referring-pointer on a target construct. + /// + /// For example, for the following test: + /// ```cpp + /// int x[10]; + /// int *px = &x[0]; + /// ... + /// #pragma omp target data map(tofrom:px) + /// { + /// int **ppx = omp_get_mapped_ptr(&px, omp_get_default_device()); + /// #pragma omp target map(tofrom:px[1]) is_device_ptr(ppx) + /// { + /// foo(px, ppx); + /// } + /// } + /// ``` + /// The following shows a possible way to implement the mapping of `px`, + /// which is pre-determined firstprivate and should get initialized + /// via corresponding-pointer-initialization: + /// + /// (A) Possible way to implement the above with PRIVATE | ATTACH: + /// ```llvm + /// ; maps for px: + /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM // (1) + /// ; &px, &px[1], sizeof(px), ATTACH // (2) + /// ; &px, &px[1], sizeof(px), PRIVATE | ATTACH | PARAM // (3) + /// call... @__omp_outlined...(ptr %px, ptr %ppx) + /// define ... @__omp_outlined(ptr %px, ptr %ppx) {... + /// foo(%px, %ppx) + /// ...} + /// ``` + /// `(1)` maps the pointee `px[1]. + /// `(2)` attaches it to the mapped version of `px`. It can be controlled by + /// the user based on the `attach(auto/always/never)` map-type modifier. + /// `(3)` privatizes and initializes the private pointer `px`, and passes it + /// into the kernel as the argument `%px`. Can be skipped if `px` is not + /// referenced in the target construct. + /// + /// While this method is not too beneficial compared to just doing the + /// initialization in the body of the kernel, like: + /// (B) Possible way to implement the above without PRIVATE | ATTACH: + /// ```llvm + /// ; maps for px: + /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM | PARAM // (4) + /// ; &px, &px[1], sizeof(px), ATTACH // (5) + /// call... @__omp_outlined...(ptr %px0, ptr %ppx) + /// define ... __omp_outlined...(ptr %px0, ptr %ppx) { + /// %px = alloca ptr; + /// store ptr %px0, ptr %px + /// foo(%px, %ppx) + /// } + /// ``` + /// + /// (B) is not so convenient for Fortran descriptors, because in + /// addition to the lookup, the remaining fields of the descriptor have + /// to be passed into the kernel to initialize the private copy, which + /// makes (A) a cleaner option for them. e.g. + /// ```f90 + /// integer, pointer :: p(:) + /// !$omp target map(p(1)) + /// ``` + /// + /// (C) Possible mapping for the above Fortran test using PRIVATE | ATTACH: + /// ```llvm + /// ; maps for p: + /// ; &p(1), &p(1), sizeof(p(1)), TO | FROM + /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), ATTACH + /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), PRIVATE | ATTACH | PARAM + /// call... @__omp_outlined...(ptr %ref_ptr_of_p) + void initBufferForCorrespondingPointerInitialization(char *Buffer, + void *HstPtr, + int64_t HstPtrSize, + void *HstPteeBase, + void *HstPteeBegin) { + constexpr int64_t VoidPtrSize = sizeof(void *); + assert(HstPtrSize >= VoidPtrSize && + "corresponding-pointer-initialization: pointer size is too small"); + + void *TgtPteeBase = + getTargetPointeeBaseForCorrespondingPointerInitialization(HstPteeBase, + HstPteeBegin); + + // Store the target pointee base address to the first VoidPtrSize bytes + DP("Initializing corresponding-pointer-initialization source buffer " + "for " DPxMOD ", with pointee base " DPxMOD "\n", + DPxPTR(HstPtr), DPxPTR(TgtPteeBase)); + std::memcpy(Buffer, &TgtPteeBase, VoidPtrSize); + if (HstPtrSize <= VoidPtrSize) + return; + + // For Fortran descriptors, copy the remaining descriptor fields from host + uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; + void *HstDescriptorFieldsAddr = static_cast(HstPtr) + VoidPtrSize; + DP("Copying %" PRId64 + " bytes of descriptor fields into corresponding-pointer-initialization " + "buffer at offset %" PRId64 ", from " DPxMOD "\n", + HstDescriptorFieldsSize, VoidPtrSize, DPxPTR(HstDescriptorFieldsAddr)); + std::memcpy(Buffer + VoidPtrSize, HstDescriptorFieldsAddr, + HstDescriptorFieldsSize); + } + + /// Helper function to create and initialize a buffer to be used as the source + /// for corresponding-pointer-initialization. + void *createAndInitSourceBufferForCorrespondingPointerInitialization( + void *HstPtr, int64_t HstPtrSize, void *HstPteeBase, void *HstPteeBegin) { + char *Buffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize); + initBufferForCorrespondingPointerInitialization(Buffer, HstPtr, HstPtrSize, + HstPteeBase, HstPteeBegin); + return Buffer; + } + // TODO: What would be the best value here? Should we make it configurable? // If the size is larger than this threshold, we will allocate and transfer it // immediately instead of packing it. @@ -1435,7 +1624,9 @@ public: int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, map_var_info_t HstPtrName = nullptr, - const bool AllocImmediately = false) { + const bool AllocImmediately = false, void *HstPteeBase = nullptr, + void *HstPteeBegin = nullptr, + bool IsCorrespondingPointerInit = false) { // If the argument is not first-private, or its size is greater than a // predefined threshold, we will allocate memory and issue the transfer // immediately. @@ -1458,9 +1649,19 @@ public: // If first-private, copy data from host if (IsFirstPrivate) { DP("Submitting firstprivate data to the device.\n"); - int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); + + // The source value used for corresponding-pointer-initialization + // is different vs regular firstprivates. + void *DataSource = + IsCorrespondingPointerInit + ? createAndInitSourceBufferForCorrespondingPointerInitialization( + HstPtr, ArgSize, HstPteeBase, HstPteeBegin) + : HstPtr; + int Ret = Device.submitData(TgtPtr, DataSource, ArgSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { - DP("Copying data to device failed, failed.\n"); + DP("Copying %s data to device failed.\n", + IsCorrespondingPointerInit ? "corresponding-pointer-initialization" + : "firstprivate"); return OFFLOAD_FAIL; } } @@ -1506,8 +1707,10 @@ public: } } - FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, - StartAlignment, Padding, HstPtrName); + FirstPrivateArgInfo.emplace_back( + TgtArgsIndex, HstPtr, ArgSize, StartAlignment, Padding, HstPtrName, + HstPteeBase, HstPteeBegin, IsCorrespondingPointerInit); + FirstPrivateArgSize += Padding + ArgSize; } @@ -1526,7 +1729,13 @@ public: for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { // First pad the pointer as we (have to) pad it on the device too. Itr = std::next(Itr, Info.Padding); - std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); + + if (Info.IsCorrespondingPointerInit) + initBufferForCorrespondingPointerInitialization( + &*Itr, Info.HstPtrBegin, Info.Size, Info.HstPteeBase, + Info.HstPteeBegin); + else + std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); Itr = std::next(Itr, Info.Size); } // Allocate target memory @@ -1682,8 +1891,40 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, TgtPtrBegin = HstPtrBase; TgtBaseOffset = 0; } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { + // For cases like: + // ``` + // int *p = ...; + // #pragma omp target map(p[0:10]) + // ``` + // `p` is predetermined firstprivate on the target construct, and the + // method to determine the initial value of the private copy on the + // device is called "corresponding-pointer-initialization". + // + // Such firstprivate pointers that need + // corresponding-pointer-initialization are represented using the + // `PRIVATE | ATTACH` map-types, in contrast to regular firstprivate + // entries, which use `PRIVATE | TO`. The structure of these + // `PRIVATE | ATTACH` entries is the same as the non-private + // `ATTACH` entries used to represent pointer-attachments, i.e.: + // ``` + // &hst_ptr_base/begin, &hst_ptee_begin, sizeof(hst_ptr) + // ``` + const bool IsAttach = (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH); + void *HstPteeBase = nullptr; + void *HstPteeBegin = nullptr; + if (IsAttach) { + // For corresponding-pointer-initialization, Args[I] is HstPteeBegin, + // and ArgBases[I] is both HstPtrBase/HstPtrBegin. + HstPteeBase = *reinterpret_cast(HstPtrBase); + HstPteeBegin = Args[I]; + HstPtrBegin = ArgBases[I]; + } TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; - const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO); + // Corresponding-pointer-initialization is a special case of firstprivate, + // since it also involves initializing the private pointer. + const bool IsFirstPrivate = + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) || IsAttach; + // If there is a next argument and it depends on the current one, we need // to allocate the private memory immediately. If this is not the case, // then the argument can be marked for optimization and packed with the @@ -1692,9 +1933,11 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); Ret = PrivateArgumentManager.addArg( HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, - TgtArgs.size(), HstPtrName, AllocImmediately); + /*TgtArgsIndex=*/TgtArgs.size(), HstPtrName, AllocImmediately, + HstPteeBase, HstPteeBegin, /*IsCorrespondingPointerInit=*/IsAttach); if (Ret != OFFLOAD_SUCCESS) { - REPORT("Failed to process %sprivate argument " DPxMOD "\n", + REPORT("Failed to process %s%sprivate argument " DPxMOD "\n", + IsAttach ? "corresponding-pointer-initialization " : "", (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin)); return OFFLOAD_FAIL; }