[flang][cuda] Use a reference for asyncObject (#140614)

Switch from `int64_t` to `int64_t*` to fit with the rest of the implementation. New tentative with some fix. The previous was reverted some time ago. Reviewed in #138010
2025-05-19 15:02:53 -07:00 · 2025-05-19 15:02:53 -07:00 · f5609aa1b0
commit f5609aa1b0
parent a04cff172f
55 changed files with 183 additions and 184 deletions
--- a/flang-rt/include/flang-rt/runtime/allocator-registry.h
+++ b/flang-rt/include/flang-rt/runtime/allocator-registry.h
@ -19,7 +19,7 @@

 namespace Fortran::runtime {

-using AllocFct = void *(*)(std::size_t, std::int64_t);
+using AllocFct = void *(*)(std::size_t, std::int64_t *);
 using FreeFct = void (*)(void *);

 typedef struct Allocator_t {
@ -28,7 +28,7 @@ typedef struct Allocator_t {
 } Allocator_t;

 static RT_API_ATTRS void *MallocWrapper(
-    std::size_t size, [[maybe_unused]] std::int64_t) {
+    std::size_t size, [[maybe_unused]] std::int64_t *) {
  return std::malloc(size);
 }
 #ifdef RT_DEVICE_COMPILATION
--- a/flang-rt/include/flang-rt/runtime/descriptor.h
+++ b/flang-rt/include/flang-rt/runtime/descriptor.h
@ -29,8 +29,8 @@
 #include <cstdio>
 #include <cstring>

-/// Value used for asyncId when no specific stream is specified.
-static constexpr std::int64_t kNoAsyncId = -1;
+/// Value used for asyncObject when no specific stream is specified.
+static constexpr std::int64_t *kNoAsyncObject = nullptr;

 namespace Fortran::runtime {

@ -372,7 +372,7 @@ public:
  // before calling.  It (re)computes the byte strides after
  // allocation.  Does not allocate automatic components or
  // perform default component initialization.
-  RT_API_ATTRS int Allocate(std::int64_t asyncId);
+  RT_API_ATTRS int Allocate(std::int64_t *asyncObject);
  RT_API_ATTRS void SetByteStrides();

  // Deallocates storage; does not call FINAL subroutines or
--- a/flang-rt/include/flang-rt/runtime/reduction-templates.h
+++ b/flang-rt/include/flang-rt/runtime/reduction-templates.h
@ -347,7 +347,7 @@ inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
    // as the element size of the source.
    result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
        CFI_attribute_allocatable);
-    if (int stat{result.Allocate(kNoAsyncId)}) {
+    if (int stat{result.Allocate(kNoAsyncObject)}) {
      terminator.Crash(
          "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
    }
--- a/flang-rt/lib/cuda/CMakeLists.txt
+++ b/flang-rt/lib/cuda/CMakeLists.txt
@ -14,6 +14,7 @@ add_flangrt_library(flang_rt.cuda STATIC SHARED
  kernel.cpp
  memmove-function.cpp
  memory.cpp
+  pointer.cpp
  registration.cpp

  TARGET_PROPERTIES
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@ -23,7 +23,7 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN

-int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
    bool *pinned, bool hasStat, const Descriptor *errMsg,
    const char *sourceFile, int sourceLine) {
  int stat{RTNAME(CUFAllocatableAllocate)(
@ -41,7 +41,7 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
  return stat;
 }

-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
    bool *pinned, bool hasStat, const Descriptor *errMsg,
    const char *sourceFile, int sourceLine) {
  if (desc.HasAddendum()) {
@ -63,7 +63,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
 }

 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
  int stat{RTNAME(CUFAllocatableAllocate)(
      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
@ -76,7 +76,7 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }

 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
  int stat{RTNAME(CUFAllocatableAllocateSync)(
      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
--- a/flang-rt/lib/cuda/allocator.cpp
+++ b/flang-rt/lib/cuda/allocator.cpp
@ -98,7 +98,7 @@ static unsigned findAllocation(void *ptr) {
  return allocNotFound;
 }

-static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
+static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) {
  CriticalSection critical{lock};
  initAllocations();
  if (numDeviceAllocations >= maxDeviceAllocations) {
@ -106,7 +106,7 @@ static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
  }
  deviceAllocations[numDeviceAllocations].ptr = ptr;
  deviceAllocations[numDeviceAllocations].size = size;
-  deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
+  deviceAllocations[numDeviceAllocations].stream = stream;
  ++numDeviceAllocations;
  qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
      compareDeviceAlloc);
@ -136,7 +136,7 @@ void RTDEF(CUFRegisterAllocator)() {
 }

 void *CUFAllocPinned(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
  void *p;
  CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
  return p;
@ -144,18 +144,18 @@ void *CUFAllocPinned(

 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }

-void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
+void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) {
  void *p;
  if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
    CUDA_REPORT_IF_ERROR(
        cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
  } else {
-    if (asyncId == kNoAsyncId) {
+    if (asyncObject == kNoAsyncObject) {
      CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
    } else {
      CUDA_REPORT_IF_ERROR(
-          cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
-      insertAllocation(p, sizeInBytes, asyncId);
+          cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject));
+      insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject);
    }
  }
  return p;
@ -174,7 +174,7 @@ void CUFFreeDevice(void *p) {
 }

 void *CUFAllocManaged(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
  void *p;
  CUDA_REPORT_IF_ERROR(
      cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@ -184,9 +184,9 @@ void *CUFAllocManaged(
 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }

 void *CUFAllocUnified(
-    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
+    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
  // Call alloc managed for the time being.
-  return CUFAllocManaged(sizeInBytes, asyncId);
+  return CUFAllocManaged(sizeInBytes, asyncObject);
 }

 void CUFFreeUnified(void *p) {
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@ -21,7 +21,7 @@ RT_EXT_API_GROUP_BEGIN
 Descriptor *RTDEF(CUFAllocDescriptor)(
    std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
  return reinterpret_cast<Descriptor *>(
-      CUFAllocManaged(sizeInBytes, /*asyncId*/ -1));
+      CUFAllocManaged(sizeInBytes, /*asyncObject=*/nullptr));
 }

 void RTDEF(CUFFreeDescriptor)(
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@ -22,7 +22,7 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN

-int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool *pinned,
+int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
    int sourceLine) {
  if (desc.HasAddendum()) {
@ -43,7 +43,7 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool *pinned,
  return stat;
 }

-int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
+int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
    bool *pinned, bool hasStat, const Descriptor *errMsg,
    const char *sourceFile, int sourceLine) {
  int stat{RTNAME(CUFPointerAllocate)(
@ -62,7 +62,7 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
 }

 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
  int stat{RTNAME(CUFPointerAllocate)(
      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
@ -75,7 +75,7 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
 }

 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
  int stat{RTNAME(CUFPointerAllocateSync)(
      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
--- a/flang-rt/lib/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@ -133,17 +133,17 @@ void RTDEF(AllocatableApplyMold)(
  }
 }

-int RTDEF(AllocatableAllocate)(Descriptor &descriptor, std::int64_t asyncId,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
+    std::int64_t *asyncObject, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
  Terminator terminator{sourceFile, sourceLine};
  if (!descriptor.IsAllocatable()) {
    return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
  } else if (descriptor.IsAllocated()) {
    return ReturnError(terminator, StatBaseNotNull, errMsg, hasStat);
  } else {
-    int stat{
-        ReturnError(terminator, descriptor.Allocate(asyncId), errMsg, hasStat)};
+    int stat{ReturnError(
+        terminator, descriptor.Allocate(asyncObject), errMsg, hasStat)};
    if (stat == StatOk) {
      if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
        if (const auto *derived{addendum->derivedType()}) {
@ -162,7 +162,7 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
    const Descriptor &source, bool hasStat, const Descriptor *errMsg,
    const char *sourceFile, int sourceLine) {
  int stat{RTNAME(AllocatableAllocate)(
-      alloc, /*asyncId=*/-1, hasStat, errMsg, sourceFile, sourceLine)};
+      alloc, /*asyncObject=*/nullptr, hasStat, errMsg, sourceFile, sourceLine)};
  if (stat == StatOk) {
    Terminator terminator{sourceFile, sourceLine};
    DoFromSourceAssign(alloc, source, terminator);
--- a/flang-rt/lib/runtime/array-constructor.cpp
+++ b/flang-rt/lib/runtime/array-constructor.cpp
@ -50,7 +50,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
          initialAllocationSize(fromElements, to.ElementBytes())};
      to.GetDimension(0).SetBounds(1, allocationSize);
      RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+      (to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
          vector.sourceFile, vector.sourceLine);
      to.GetDimension(0).SetBounds(1, fromElements);
      vector.actualAllocationSize = allocationSize;
@ -59,7 +59,7 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
      // first value: there should be no reallocation.
      RUNTIME_CHECK(terminator, previousToElements >= fromElements);
      RTNAME(AllocatableAllocate)
-      (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+      (to, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
          vector.sourceFile, vector.sourceLine);
      vector.actualAllocationSize = previousToElements;
    }
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@ -102,7 +102,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS(
    toDim.SetByteStride(stride);
    stride *= toDim.Extent();
  }
-  int result{ReturnError(terminator, to.Allocate(kNoAsyncId))};
+  int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))};
  if (result == StatOk && derived && !derived->noInitializationNeeded()) {
    result = ReturnError(terminator, Initialize(to, *derived, terminator));
  }
@ -280,7 +280,7 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
      // entity, otherwise, the Deallocate() below will not
      // free the descriptor memory.
      newFrom.raw().attribute = CFI_attribute_allocatable;
-      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncId))};
+      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))};
      if (stat == StatOk) {
        if (HasDynamicComponent(from)) {
          // If 'from' has allocatable/automatic component, we cannot
--- a/flang-rt/lib/runtime/character.cpp
+++ b/flang-rt/lib/runtime/character.cpp
@ -118,7 +118,7 @@ static RT_API_ATTRS void Compare(Descriptor &result, const Descriptor &x,
  for (int j{0}; j < rank; ++j) {
    result.GetDimension(j).SetBounds(1, ub[j]);
  }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
    terminator.Crash("Compare: could not allocate storage for result");
  }
  std::size_t xChars{x.ElementBytes() >> shift<CHAR>};
@ -173,7 +173,7 @@ static RT_API_ATTRS void AdjustLRHelper(Descriptor &result,
  for (int j{0}; j < rank; ++j) {
    result.GetDimension(j).SetBounds(1, ub[j]);
  }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
    terminator.Crash("ADJUSTL/R: could not allocate storage for result");
  }
  for (SubscriptValue resultAt{0}; elements-- > 0;
@ -227,7 +227,7 @@ static RT_API_ATTRS void LenTrim(Descriptor &result, const Descriptor &string,
  for (int j{0}; j < rank; ++j) {
    result.GetDimension(j).SetBounds(1, ub[j]);
  }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
    terminator.Crash("LEN_TRIM: could not allocate storage for result");
  }
  std::size_t stringElementChars{string.ElementBytes() >> shift<CHAR>};
@ -427,7 +427,7 @@ static RT_API_ATTRS void GeneralCharFunc(Descriptor &result,
  for (int j{0}; j < rank; ++j) {
    result.GetDimension(j).SetBounds(1, ub[j]);
  }
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
    terminator.Crash("SCAN/VERIFY: could not allocate storage for result");
  }
  std::size_t stringElementChars{string.ElementBytes() >> shift<CHAR>};
@ -530,7 +530,8 @@ static RT_API_ATTRS void MaxMinHelper(Descriptor &accumulator,
    for (int j{0}; j < rank; ++j) {
      accumulator.GetDimension(j).SetBounds(1, ub[j]);
    }
-    RUNTIME_CHECK(terminator, accumulator.Allocate(kNoAsyncId) == CFI_SUCCESS);
+    RUNTIME_CHECK(
+        terminator, accumulator.Allocate(kNoAsyncObject) == CFI_SUCCESS);
  }
  for (CHAR *result{accumulator.OffsetElement<CHAR>()}; elements-- > 0;
       accumData += accumChars, result += chars, x.IncrementSubscripts(xAt)) {
@ -606,7 +607,7 @@ void RTDEF(CharacterConcatenate)(Descriptor &accumulator,
  for (int j{0}; j < rank; ++j) {
    accumulator.GetDimension(j).SetBounds(1, ub[j]);
  }
-  if (accumulator.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (accumulator.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
    terminator.Crash(
        "CharacterConcatenate: could not allocate storage for result");
  }
@ -629,7 +630,8 @@ void RTDEF(CharacterConcatenateScalar1)(
  accumulator.set_base_addr(nullptr);
  std::size_t oldLen{accumulator.ElementBytes()};
  accumulator.raw().elem_len += chars;
-  RUNTIME_CHECK(terminator, accumulator.Allocate(kNoAsyncId) == CFI_SUCCESS);
+  RUNTIME_CHECK(
+      terminator, accumulator.Allocate(kNoAsyncObject) == CFI_SUCCESS);
  std::memcpy(accumulator.OffsetElement<char>(oldLen), from, chars);
  FreeMemory(old);
 }
@ -831,7 +833,7 @@ void RTDEF(Repeat)(Descriptor &result, const Descriptor &string,
  std::size_t origBytes{string.ElementBytes()};
  result.Establish(string.type(), origBytes * ncopies, nullptr, 0, nullptr,
      CFI_attribute_allocatable);
-  if (result.Allocate(kNoAsyncId) != CFI_SUCCESS) {
+  if (result.Allocate(kNoAsyncObject) != CFI_SUCCESS) {
    terminator.Crash("REPEAT could not allocate storage for result");
  }
  const char *from{string.OffsetElement()};
@ -865,7 +867,7 @@ void RTDEF(Trim)(Descriptor &result, const Descriptor &string,
  }
  result.Establish(string.type(), resultBytes, nullptr, 0, nullptr,
      CFI_attribute_allocatable);
-  RUNTIME_CHECK(terminator, result.Allocate(kNoAsyncId) == CFI_SUCCESS);
+  RUNTIME_CHECK(terminator, result.Allocate(kNoAsyncObject) == CFI_SUCCESS);
  std::memcpy(result.OffsetElement(), string.OffsetElement(), resultBytes);
 }

--- a/flang-rt/lib/runtime/copy.cpp
+++ b/flang-rt/lib/runtime/copy.cpp
@ -171,8 +171,8 @@ RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[],
                *reinterpret_cast<Descriptor *>(toPtr + component->offset())};
            if (toDesc.raw().base_addr != nullptr) {
              toDesc.set_base_addr(nullptr);
-              RUNTIME_CHECK(
-                  terminator, toDesc.Allocate(/*asyncId=*/-1) == CFI_SUCCESS);
+              RUNTIME_CHECK(terminator,
+                  toDesc.Allocate(/*asyncObject=*/nullptr) == CFI_SUCCESS);
              const Descriptor &fromDesc{*reinterpret_cast<const Descriptor *>(
                  fromPtr + component->offset())};
              copyStack.emplace(toDesc, fromDesc);
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@ -52,7 +52,7 @@ RT_API_ATTRS int Initialize(const Descriptor &instance,
        allocDesc.raw().attribute = CFI_attribute_allocatable;
        if (comp.genre() == typeInfo::Component::Genre::Automatic) {
          stat = ReturnError(
-              terminator, allocDesc.Allocate(kNoAsyncId), errMsg, hasStat);
+              terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
          if (stat == StatOk) {
            if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) {
              if (const auto *derived{addendum->derivedType()}) {
@ -153,7 +153,7 @@ RT_API_ATTRS int InitializeClone(const Descriptor &clone,
        if (origDesc.IsAllocated()) {
          cloneDesc.ApplyMold(origDesc, origDesc.rank());
          stat = ReturnError(
-              terminator, cloneDesc.Allocate(kNoAsyncId), errMsg, hasStat);
+              terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
          if (stat == StatOk) {
            if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) {
              if (const typeInfo::DerivedType *
@ -260,7 +260,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
        copy.raw().attribute = CFI_attribute_allocatable;
        Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0};
        RUNTIME_CHECK(terminator ? *terminator : stubTerminator,
-            copy.Allocate(kNoAsyncId) == CFI_SUCCESS);
+            copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
        ShallowCopyDiscontiguousToContiguous(copy, descriptor);
        argDescriptor = &copy;
      }
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@ -158,7 +158,7 @@ RT_API_ATTRS static inline int MapAllocIdx(const Descriptor &desc) {
 #endif
 }

-RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
+RT_API_ATTRS int Descriptor::Allocate(std::int64_t *asyncObject) {
  std::size_t elementBytes{ElementBytes()};
  if (static_cast<std::int64_t>(elementBytes) < 0) {
    // F'2023 7.4.4.2 p5: "If the character length parameter value evaluates
@ -170,7 +170,7 @@ RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
  // Zero size allocation is possible in Fortran and the resulting
  // descriptor must be allocated/associated. Since std::malloc(0)
  // result is implementation defined, always allocate at least one byte.
-  void *p{alloc(byteSize ? byteSize : 1, asyncId)};
+  void *p{alloc(byteSize ? byteSize : 1, asyncObject)};
  if (!p) {
    return CFI_ERROR_MEM_ALLOCATION;
  }
--- a/flang-rt/lib/runtime/extrema.cpp
+++ b/flang-rt/lib/runtime/extrema.cpp
@ -152,7 +152,7 @@ inline RT_API_ATTRS void CharacterMaxOrMinLoc(const char *intrinsic,
      CFI_attribute_allocatable);
  result.GetDimension(0).SetBounds(1, extent[0]);
  Terminator terminator{source, line};
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
    terminator.Crash(
        "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
  }
@ -181,7 +181,7 @@ inline RT_API_ATTRS void TotalNumericMaxOrMinLoc(const char *intrinsic,
      CFI_attribute_allocatable);
  result.GetDimension(0).SetBounds(1, extent[0]);
  Terminator terminator{source, line};
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
    terminator.Crash(
        "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
  }
--- a/flang-rt/lib/runtime/findloc.cpp
+++ b/flang-rt/lib/runtime/findloc.cpp
@ -220,7 +220,7 @@ void RTDEF(Findloc)(Descriptor &result, const Descriptor &x,
      CFI_attribute_allocatable);
  result.GetDimension(0).SetBounds(1, extent[0]);
  Terminator terminator{source, line};
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
    terminator.Crash(
        "FINDLOC: could not allocate memory for result; STAT=%d", stat);
  }
--- a/flang-rt/lib/runtime/matmul-transpose.cpp
+++ b/flang-rt/lib/runtime/matmul-transpose.cpp
@ -183,7 +183,7 @@ inline static RT_API_ATTRS void DoMatmulTranspose(
    for (int j{0}; j < resRank; ++j) {
      result.GetDimension(j).SetBounds(1, extent[j]);
    }
-    if (int stat{result.Allocate(kNoAsyncId)}) {
+    if (int stat{result.Allocate(kNoAsyncObject)}) {
      terminator.Crash(
          "MATMUL-TRANSPOSE: could not allocate memory for result; STAT=%d",
          stat);
--- a/flang-rt/lib/runtime/matmul.cpp
+++ b/flang-rt/lib/runtime/matmul.cpp
@ -255,7 +255,7 @@ static inline RT_API_ATTRS void DoMatmul(
    for (int j{0}; j < resRank; ++j) {
      result.GetDimension(j).SetBounds(1, extent[j]);
    }
-    if (int stat{result.Allocate(kNoAsyncId)}) {
+    if (int stat{result.Allocate(kNoAsyncObject)}) {
      terminator.Crash(
          "MATMUL: could not allocate memory for result; STAT=%d", stat);
    }
--- a/flang-rt/lib/runtime/misc-intrinsic.cpp
+++ b/flang-rt/lib/runtime/misc-intrinsic.cpp
@ -30,7 +30,7 @@ static RT_API_ATTRS void TransferImpl(Descriptor &result,
  if (const DescriptorAddendum * addendum{mold.Addendum()}) {
    *result.Addendum() = *addendum;
  }
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
    Terminator{sourceFile, line}.Crash(
        "TRANSFER: could not allocate memory for result; STAT=%d", stat);
  }
--- a/flang-rt/lib/runtime/pointer.cpp
+++ b/flang-rt/lib/runtime/pointer.cpp
@ -129,7 +129,7 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
  byteSize = ((byteSize + align - 1) / align) * align;
  std::size_t total{byteSize + sizeof(std::uintptr_t)};
  AllocFct alloc{allocatorRegistry.GetAllocator(allocatorIdx)};
-  void *p{alloc(total, /*asyncId=*/-1)};
+  void *p{alloc(total, /*asyncObject=*/nullptr)};
  if (p && allocatorIdx == 0) {
    // Fill the footer word with the XOR of the ones' complement of
    // the base address, which is a value that would be highly unlikely
--- a/flang-rt/lib/runtime/temporary-stack.cpp
+++ b/flang-rt/lib/runtime/temporary-stack.cpp
@ -148,7 +148,7 @@ void DescriptorStorage<COPY_VALUES>::push(const Descriptor &source) {
  if constexpr (COPY_VALUES) {
    // copy the data pointed to by the box
    box.set_base_addr(nullptr);
-    box.Allocate(kNoAsyncId);
+    box.Allocate(kNoAsyncObject);
    RTNAME(AssignTemporary)
    (box, source, terminator_.sourceFileName(), terminator_.sourceLine());
  }
--- a/flang-rt/lib/runtime/tools.cpp
+++ b/flang-rt/lib/runtime/tools.cpp
@ -261,7 +261,7 @@ RT_API_ATTRS void CreatePartialReductionResult(Descriptor &result,
  for (int j{0}; j + 1 < xRank; ++j) {
    result.GetDimension(j).SetBounds(1, resultExtent[j]);
  }
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
    terminator.Crash(
        "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
  }
--- a/flang-rt/lib/runtime/transformational.cpp
+++ b/flang-rt/lib/runtime/transformational.cpp
@ -132,7 +132,7 @@ static inline RT_API_ATTRS std::size_t AllocateResult(Descriptor &result,
  for (int j{0}; j < rank; ++j) {
    result.GetDimension(j).SetBounds(1, extent[j]);
  }
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
    terminator.Crash(
        "%s: Could not allocate memory for result (stat=%d)", function, stat);
  }
@ -157,7 +157,7 @@ static inline RT_API_ATTRS std::size_t AllocateBesselResult(Descriptor &result,
  for (int j{0}; j < rank; ++j) {
    result.GetDimension(j).SetBounds(1, extent[j]);
  }
-  if (int stat{result.Allocate(kNoAsyncId)}) {
+  if (int stat{result.Allocate(kNoAsyncObject)}) {
    terminator.Crash(
        "%s: Could not allocate memory for result (stat=%d)", function, stat);
  }
--- a/flang-rt/unittests/Evaluate/reshape.cpp
+++ b/flang-rt/unittests/Evaluate/reshape.cpp
@ -26,7 +26,7 @@ int main() {
  for (int j{0}; j < 3; ++j) {
    source->GetDimension(j).SetBounds(1, sourceExtent[j]);
  }
-  TEST(source->Allocate(kNoAsyncId) == CFI_SUCCESS);
+  TEST(source->Allocate(kNoAsyncObject) == CFI_SUCCESS);
  TEST(source->IsAllocated());
  MATCH(2, source->GetDimension(0).Extent());
  MATCH(3, source->GetDimension(1).Extent());
--- a/flang-rt/unittests/Runtime/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/Allocatable.cpp
@ -26,7 +26,7 @@ TEST(AllocatableTest, MoveAlloc) {
  auto b{createAllocatable(TypeCategory::Integer, 4)};
  // ALLOCATE(a(20))
  a->GetDimension(0).SetBounds(1, 20);
-  a->Allocate(kNoAsyncId);
+  a->Allocate(kNoAsyncObject);

  EXPECT_TRUE(a->IsAllocated());
  EXPECT_FALSE(b->IsAllocated());
@ -46,7 +46,7 @@ TEST(AllocatableTest, MoveAlloc) {
  // move_alloc with errMsg
  auto errMsg{Descriptor::Create(
      sizeof(char), 64, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  errMsg->Allocate(kNoAsyncId);
+  errMsg->Allocate(kNoAsyncObject);
  RTNAME(MoveAlloc)(*b, *a, nullptr, false, errMsg.get(), __FILE__, __LINE__);
  EXPECT_FALSE(a->IsAllocated());
  EXPECT_TRUE(b->IsAllocated());
--- a/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Allocatable.cpp
@ -42,7 +42,8 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocatable) {
  CUDA_REPORT_IF_ERROR(cudaMalloc(&device_desc, a->SizeInBytes()));

  RTNAME(AllocatableAllocate)
-  (*a, kNoAsyncId, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  (*a, kNoAsyncObject, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+      __LINE__);
  EXPECT_TRUE(a->IsAllocated());
  RTNAME(CUFDescriptorSync)(device_desc, a.get(), __FILE__, __LINE__);
  cudaDeviceSynchronize();
@ -82,19 +83,22 @@ TEST(AllocatableCUFTest, StreamDeviceAllocatable) {
  RTNAME(AllocatableSetBounds)(*c, 0, 1, 100);

  RTNAME(AllocatableAllocate)
-  (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  (*a, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+      __LINE__);
  EXPECT_TRUE(a->IsAllocated());
  cudaDeviceSynchronize();
  EXPECT_EQ(cudaSuccess, cudaGetLastError());

  RTNAME(AllocatableAllocate)
-  (*b, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  (*b, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+      __LINE__);
  EXPECT_TRUE(b->IsAllocated());
  cudaDeviceSynchronize();
  EXPECT_EQ(cudaSuccess, cudaGetLastError());

  RTNAME(AllocatableAllocate)
-  (*c, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__);
+  (*c, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+      __LINE__);
  EXPECT_TRUE(c->IsAllocated());
  cudaDeviceSynchronize();
  EXPECT_EQ(cudaSuccess, cudaGetLastError());
--- a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
@ -35,7 +35,7 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
  EXPECT_FALSE(a->HasAddendum());
  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
  RTNAME(AllocatableAllocate)
-  (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+  (*a, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
      __LINE__);
  EXPECT_TRUE(a->IsAllocated());
  RTNAME(AllocatableDeallocate)
@ -54,7 +54,7 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
  EXPECT_FALSE(a->HasAddendum());
  RTNAME(AllocatableSetBounds)(*a, 0, 1, 10);
  RTNAME(AllocatableAllocate)
-  (*a, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
+  (*a, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
      __LINE__);
  EXPECT_TRUE(a->IsAllocated());
  RTNAME(AllocatableDeallocate)
--- a/flang-rt/unittests/Runtime/CUDA/Memory.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/Memory.cpp
@ -50,8 +50,8 @@ TEST(MemoryCUFTest, CUFDataTransferDescDesc) {
  EXPECT_EQ((int)kDeviceAllocatorPos, dev->GetAllocIdx());
  RTNAME(AllocatableSetBounds)(*dev, 0, 1, 10);
  RTNAME(AllocatableAllocate)
-  (*dev, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__,
-      __LINE__);
+  (*dev, /*asyncObject=*/nullptr, /*hasStat=*/false, /*errMsg=*/nullptr,
+      __FILE__, __LINE__);
  EXPECT_TRUE(dev->IsAllocated());

  // Create temp array to transfer to device.
--- a/flang-rt/unittests/Runtime/CharacterTest.cpp
+++ b/flang-rt/unittests/Runtime/CharacterTest.cpp
@ -35,7 +35,7 @@ OwningPtr<Descriptor> CreateDescriptor(const std::vector<SubscriptValue> &shape,
  for (int j{0}; j < rank; ++j) {
    descriptor->GetDimension(j).SetBounds(2, shape[j] + 1);
  }
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
    return nullptr;
  }

--- a/flang-rt/unittests/Runtime/CommandTest.cpp
+++ b/flang-rt/unittests/Runtime/CommandTest.cpp
@ -26,7 +26,7 @@ template <std::size_t n = 64>
 static OwningPtr<Descriptor> CreateEmptyCharDescriptor() {
  OwningPtr<Descriptor> descriptor{Descriptor::Create(
      sizeof(char), n, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
    return nullptr;
  }
  return descriptor;
@ -36,7 +36,7 @@ static OwningPtr<Descriptor> CharDescriptor(const char *value) {
  std::size_t n{std::strlen(value)};
  OwningPtr<Descriptor> descriptor{Descriptor::Create(
      sizeof(char), n, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
    return nullptr;
  }
  std::memcpy(descriptor->OffsetElement(), value, n);
@ -47,7 +47,7 @@ template <int kind = sizeof(std::int64_t)>
 static OwningPtr<Descriptor> EmptyIntDescriptor() {
  OwningPtr<Descriptor> descriptor{Descriptor::Create(TypeCategory::Integer,
      kind, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
    return nullptr;
  }
  return descriptor;
@ -57,7 +57,7 @@ template <int kind = sizeof(std::int64_t)>
 static OwningPtr<Descriptor> IntDescriptor(const int &value) {
  OwningPtr<Descriptor> descriptor{Descriptor::Create(TypeCategory::Integer,
      kind, nullptr, 0, nullptr, CFI_attribute_allocatable)};
-  if (descriptor->Allocate(kNoAsyncId) != 0) {
+  if (descriptor->Allocate(kNoAsyncObject) != 0) {
    return nullptr;
  }
  std::memcpy(descriptor->OffsetElement<int>(), &value, sizeof(int));
--- a/flang-rt/unittests/Runtime/TemporaryStack.cpp
+++ b/flang-rt/unittests/Runtime/TemporaryStack.cpp
@ -59,7 +59,7 @@ TEST(TemporaryStack, ValueStackBasic) {
  Descriptor &outputDesc2{testDescriptorStorage[2].descriptor()};
  inputDesc.Establish(code, elementBytes, descriptorPtr, rank, extent);

-  inputDesc.Allocate(kNoAsyncId);
+  inputDesc.Allocate(kNoAsyncObject);
  ASSERT_EQ(inputDesc.IsAllocated(), true);
  uint32_t *inputData = static_cast<uint32_t *>(inputDesc.raw().base_addr);
  for (std::size_t i = 0; i < inputDesc.Elements(); ++i) {
@ -123,7 +123,7 @@ TEST(TemporaryStack, ValueStackMultiSize) {
      boxDims.extent = extent[dim];
      boxDims.sm = elementBytes;
    }
-    desc->Allocate(kNoAsyncId);
+    desc->Allocate(kNoAsyncObject);

    // fill the array with some data to test
    for (uint32_t i = 0; i < desc->Elements(); ++i) {
--- a/flang-rt/unittests/Runtime/tools.h
+++ b/flang-rt/unittests/Runtime/tools.h
@ -42,7 +42,7 @@ static OwningPtr<Descriptor> MakeArray(const std::vector<int> &shape,
  for (int j{0}; j < rank; ++j) {
    result->GetDimension(j).SetBounds(1, shape[j]);
  }
-  int stat{result->Allocate(kNoAsyncId)};
+  int stat{result->Allocate(kNoAsyncObject)};
  EXPECT_EQ(stat, 0) << stat;
  EXPECT_LE(data.size(), result->Elements());
  char *p{result->OffsetElement<char>()};
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@ -95,12 +95,11 @@ def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments,
  }];

  let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
-                       Optional<AnyIntegerType>:$stream,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
-                       cuf_DataAttributeAttr:$data_attr,
-                       UnitAttr:$hasStat);
+      Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
+      Optional<fir_ReferenceType>:$stream,
+      Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
+      Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
+      cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat);

  let results = (outs AnyIntegerType:$stat);

--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@ -17,14 +17,14 @@ namespace Fortran::runtime::cuda {
 extern "C" {

 /// Perform allocation of the descriptor.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t stream = -1,
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
    bool *pinned = nullptr, bool hasStat = false,
    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
    int sourceLine = 0);

 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1,
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
    bool *pinned = nullptr, bool hasStat = false,
    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
    int sourceLine = 0);
@ -32,14 +32,14 @@ int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1,
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
    bool hasStat = false, const Descriptor *errMsg = nullptr,
    const char *sourceFile = nullptr, int sourceLine = 0);

 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
    bool hasStat = false, const Descriptor *errMsg = nullptr,
    const char *sourceFile = nullptr, int sourceLine = 0);

--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@ -20,16 +20,16 @@ extern "C" {
 void RTDECL(CUFRegisterAllocator)();
 }

-void *CUFAllocPinned(std::size_t, std::int64_t);
+void *CUFAllocPinned(std::size_t, std::int64_t *);
 void CUFFreePinned(void *);

-void *CUFAllocDevice(std::size_t, std::int64_t);
+void *CUFAllocDevice(std::size_t, std::int64_t *);
 void CUFFreeDevice(void *);

-void *CUFAllocManaged(std::size_t, std::int64_t);
+void *CUFAllocManaged(std::size_t, std::int64_t *);
 void CUFFreeManaged(void *);

-void *CUFAllocUnified(std::size_t, std::int64_t);
+void *CUFAllocUnified(std::size_t, std::int64_t *);
 void CUFFreeUnified(void *);

 } // namespace Fortran::runtime::cuda
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@ -17,14 +17,14 @@ namespace Fortran::runtime::cuda {
 extern "C" {

 /// Perform allocation of the descriptor.
-int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t stream = -1,
+int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
    bool *pinned = nullptr, bool hasStat = false,
    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
    int sourceLine = 0);

 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t stream = -1,
+int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
    bool *pinned = nullptr, bool hasStat = false,
    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
    int sourceLine = 0);
@ -32,14 +32,14 @@ int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t stream = -1,
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
    bool hasStat = false, const Descriptor *errMsg = nullptr,
    const char *sourceFile = nullptr, int sourceLine = 0);

 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
    bool hasStat = false, const Descriptor *errMsg = nullptr,
    const char *sourceFile = nullptr, int sourceLine = 0);

--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@ -94,9 +94,10 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
 // Successfully allocated memory is initialized if the allocatable has a
 // derived type, and is always initialized by AllocatableAllocateSource().
 // Performs all necessary coarray synchronization and validation actions.
-int RTDECL(AllocatableAllocate)(Descriptor &, std::int64_t asyncId = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+int RTDECL(AllocatableAllocate)(Descriptor &,
+    std::int64_t *asyncObject = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
    bool hasStat = false, const Descriptor *errMsg = nullptr,
    const char *sourceFile = nullptr, int sourceLine = 0);
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@ -760,7 +760,7 @@ private:
    mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr;
    mlir::Value stream =
        streamExpr
-            ? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx))
+            ? fir::getBase(converter.genExprAddr(loc, *streamExpr, stmtCtx))
            : nullptr;
    mlir::Value pinned =
        pinnedExpr
--- a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
@ -76,8 +76,7 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder,
  mlir::func::FuncOp func{
      fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc, builder)};
  mlir::FunctionType fTy{func.getFunctionType()};
-  mlir::Value asyncId =
-      builder.createIntegerConstant(loc, builder.getI64Type(), -1);
+  mlir::Value asyncObject = builder.createNullConstant(loc);
  mlir::Value sourceFile{fir::factory::locationToFilename(builder, loc)};
  mlir::Value sourceLine{
      fir::factory::locationToLineNo(builder, loc, fTy.getInput(5))};
@ -88,7 +87,7 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder,
    errMsg = builder.create<fir::AbsentOp>(loc, boxNoneTy).getResult();
  }
  llvm::SmallVector<mlir::Value> args{
-      fir::runtime::createArguments(builder, loc, fTy, desc, asyncId, hasStat,
-                                    errMsg, sourceFile, sourceLine)};
+      fir::runtime::createArguments(builder, loc, fTy, desc, asyncObject,
+                                    hasStat, errMsg, sourceFile, sourceLine)};
  builder.create<fir::CallOp>(loc, func, args);
 }
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@ -76,6 +76,16 @@ llvm::LogicalResult cuf::FreeOp::verify() { return checkCudaAttr(*this); }
 // AllocateOp
 //===----------------------------------------------------------------------===//

+template <typename OpTy>
+static llvm::LogicalResult checkStreamType(OpTy op) {
+  if (!op.getStream())
+    return mlir::success();
+  if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType()))
+    if (!refTy.getEleTy().isInteger(64))
+      return op.emitOpError("stream is expected to be an i64 reference");
+  return mlir::success();
+}
+
 llvm::LogicalResult cuf::AllocateOp::verify() {
  if (getPinned() && getStream())
    return emitOpError("pinned and stream cannot appears at the same time");
@ -92,7 +102,7 @@ llvm::LogicalResult cuf::AllocateOp::verify() {
        "expect errmsg to be a reference to/or a box type value");
  if (getErrmsg() && !getHasStat())
    return emitOpError("expect stat attribute when errmsg is provided");
-  return mlir::success();
+  return checkStreamType(*this);
 }

 //===----------------------------------------------------------------------===//
@ -143,16 +153,6 @@ llvm::LogicalResult cuf::DeallocateOp::verify() {
 // KernelLaunchOp
 //===----------------------------------------------------------------------===//

-template <typename OpTy>
-static llvm::LogicalResult checkStreamType(OpTy op) {
-  if (!op.getStream())
-    return mlir::success();
-  if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType()))
-    if (!refTy.getEleTy().isInteger(64))
-      return op.emitOpError("stream is expected to be an i64 reference");
-  return mlir::success();
-}
-
 llvm::LogicalResult cuf::KernelLaunchOp::verify() {
  return checkStreamType(*this);
 }
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@ -129,17 +129,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
                           mlir::IntegerType::get(op.getContext(), 1)));
    if (op.getSource()) {
      mlir::Value stream =
-          op.getStream()
-              ? op.getStream()
-              : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
+          op.getStream() ? op.getStream()
+                         : builder.createNullConstant(loc, fTy.getInput(2));
      args = fir::runtime::createArguments(
          builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
          hasStat, errmsg, sourceFile, sourceLine);
    } else {
      mlir::Value stream =
-          op.getStream()
-              ? op.getStream()
-              : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
+          op.getStream() ? op.getStream()
+                         : builder.createNullConstant(loc, fTy.getInput(1));
      args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
                                           stream, pinned, hasStat, errmsg,
                                           sourceFile, sourceLine);
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@ -47,7 +47,7 @@ func.func @_QPsub3() {
 // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)

 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@ -87,7 +87,7 @@ func.func @_QPsub5() {
 }

 // CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32


@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 // CHECK: _FortranAAllocatableSetBounds
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32


 func.func @_QPallocate_source() {
@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
 // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
 // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32


 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@ -170,16 +170,14 @@ func.func @_QQallocate_stream() {
  %1 = fir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
  %2 = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
  %3 = fir.declare %2 {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
-  %4 = fir.load %3 : !fir.ref<i64>
-  %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%4 : i64) {data_attr = #cuf.cuda<device>} -> i32
+  %5 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> stream(%3 : !fir.ref<i64>) {data_attr = #cuf.cuda<device>} -> i32
  return
 }

 // CHECK-LABEL: func.func @_QQallocate_stream() 
 // CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
 // CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
-// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32


 func.func @_QPp_alloc() {
@ -268,6 +266,6 @@ func.func @_QQpinned() attributes {fir.bindc_name = "testasync"} {
 // CHECK: %[[PINNED:.*]] = fir.alloca !fir.logical<4> {bindc_name = "pinnedflag", uniq_name = "_QFEpinnedflag"}
 // CHECK: %[[DECL_PINNED:.*]] = fir.declare %[[PINNED]] {uniq_name = "_QFEpinnedflag"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
 // CHECK: %[[CONV_PINNED:.*]] = fir.convert %[[DECL_PINNED]] : (!fir.ref<!fir.logical<4>>) -> !fir.ref<i1>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 } // end of module
--- a/flang/test/Fir/cuf-invalid.fir
+++ b/flang/test/Fir/cuf-invalid.fir
@ -2,13 +2,12 @@

 func.func @_QPsub1() {
  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %1 = fir.alloca i32
+  %s = fir.alloca i64
  %pinned = fir.alloca i1
  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
  %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-  %s = fir.load %1 : !fir.ref<i32>
  // expected-error@+1{{'cuf.allocate' op pinned and stream cannot appears at the same time}}
-  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%s : i32) pinned(%pinned : !fir.ref<i1>) {data_attr = #cuf.cuda<device>} -> i32
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%s : !fir.ref<i64>) pinned(%pinned : !fir.ref<i1>) {data_attr = #cuf.cuda<device>} -> i32
  return
 }

--- a/flang/test/Fir/cuf.mlir
+++ b/flang/test/Fir/cuf.mlir
@ -18,15 +18,14 @@ func.func @_QPsub1() {

 func.func @_QPsub1() {
  %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %1 = fir.alloca i32
+  %1 = fir.alloca i64
  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
  %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-  %s = fir.load %1 : !fir.ref<i32>
-  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%s : i32) {data_attr = #cuf.cuda<device>} -> i32
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%1 : !fir.ref<i64>) {data_attr = #cuf.cuda<device>} -> i32
  return
 }

-// CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<none>> stream(%{{.*}} : i32) {data_attr = #cuf.cuda<device>} -> i32
+// CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<none>> stream(%{{.*}} : !fir.ref<i64>) {data_attr = #cuf.cuda<device>} -> i32

 // -----

--- a/flang/test/HLFIR/elemental-codegen.fir
+++ b/flang/test/HLFIR/elemental-codegen.fir
@ -191,7 +191,7 @@ func.func @test_polymorphic(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.bindc_
 // CHECK:           %[[VAL_35:.*]] = fir.absent !fir.box<none>
 // CHECK:           %[[VAL_36:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:           %[[VAL_37:.*]] = fir.convert %[[VAL_31]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:           %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %{{.*}}, %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:           %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %{{.*}}, %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:           %[[VAL_12:.*]] = arith.constant true
 // CHECK:           %[[VAL_39:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
 // CHECK:           %[[VAL_40:.*]] = arith.constant 1 : index
@ -275,7 +275,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
 // CHECK:           %[[VAL_36:.*]] = fir.absent !fir.box<none>
 // CHECK:           %[[VAL_37:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:           %[[VAL_38:.*]] = fir.convert %[[VAL_32]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:           %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %{{.*}}, %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:           %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %{{.*}}, %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:           %[[VAL_13:.*]] = arith.constant true
 // CHECK:           %[[VAL_40:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
 // CHECK:           %[[VAL_41:.*]] = arith.constant 1 : index
@ -328,7 +328,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
 // CHECK:           %[[VAL_85:.*]] = fir.absent !fir.box<none>
 // CHECK:           %[[VAL_86:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:           %[[VAL_87:.*]] = fir.convert %[[VAL_81]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:           %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %{{.*}}, %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:           %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %{{.*}}, %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:           %[[VAL_62:.*]] = arith.constant true
 // CHECK:           %[[VAL_89:.*]] = fir.load %[[VAL_63]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
 // CHECK:           %[[VAL_90:.*]] = arith.constant 1 : index
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@ -90,7 +90,7 @@ end subroutine

 subroutine sub4()
  real, allocatable, device :: a(:)
-  integer :: istream
+  integer(8) :: istream
  allocate(a(10), stream=istream)
 end subroutine

@ -98,11 +98,10 @@ end subroutine
 ! CHECK: %[[BOX:.*]] = cuf.alloc !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub4Ea"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: fir.embox {{.*}} {allocator_idx = 2 : i32}
 ! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
-! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ISTREAM:.*]] = fir.alloca i64 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
+! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
 ! CHECK: fir.call @_FortranAAllocatableSetBounds
-! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
-! CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {data_attr = #cuf.cuda<device>} -> i32
+! CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[ISTREAM_DECL]]#0 : !fir.ref<i64>) {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: fir.if %{{.*}} {
 ! CHECK:   %{{.*}} = cuf.deallocate %[[BOX_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: }
--- a/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90
+++ b/flang/test/Lower/OpenACC/acc-declare-unwrap-defaultbounds.f90
@ -473,6 +473,6 @@ contains
 end module

 ! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit()
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK: fir.if
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
--- a/flang/test/Lower/OpenACC/acc-declare.f90
+++ b/flang/test/Lower/OpenACC/acc-declare.f90
@ -434,6 +434,6 @@ contains
 end module

 ! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit()
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK: fir.if
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@ -267,7 +267,7 @@ contains
 ! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[P_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[P_CAST:.*]] = fir.convert %[[P_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK-DAG: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
@ -276,7 +276,7 @@ contains
 ! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C1_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK-DAG: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
@ -285,7 +285,7 @@ contains
 ! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C2_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK-DAG: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
@ -300,7 +300,7 @@ contains
 ! CHECK-DAG: %[[C10_I64:.*]] = fir.convert %[[C10]] : (i32) -> i64
 ! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[C3_CAST]], %[[C0]], %[[C1_I64]], %[[C10_I64]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK-DAG: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
@ -316,7 +316,7 @@ contains
 ! CHECK-DAG: %[[C20_I64:.*]] = fir.convert %[[C20]] : (i32) -> i64
 ! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[C4_CAST]], %[[C0]], %[[C1_I64]], %[[C20_I64]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 ! CHECK: %[[C1_LOAD1:.*]] = fir.load %[[C1_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>
 ! CHECK: fir.dispatch "proc1"(%[[C1_LOAD1]] : !fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>)
@ -390,7 +390,7 @@ contains
 ! CHECK-DAG: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, i32, i32, i32, i32) -> ()
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[P_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 ! CHECK-DAG: %[[BOX_NONE:.*]] = fir.convert %[[PTR_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK-DAG: %[[CAT:.*]] = arith.constant 2 : i32
@ -573,7 +573,7 @@ contains
 ! CHECK-DAG: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[A_NONE]], %[[LEN]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i32, i32, i32) -> ()
 ! CHECK: %[[A_NONE:.*]] = fir.convert %[[A_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 end module

@ -592,17 +592,17 @@ end
 ! LLVM-LABEL: define void @_QMpolyPtest_allocatable()

 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp1, i32 1, i32 0)
 ! LLVM: call void @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 10)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %{{.*}}, ptr @_QMpolyEXdtXp2, i32 1, i32 0)
 ! LLVM: call void @_FortranAAllocatableSetBounds(ptr %{{.*}}, i32 0, i64 1, i64 20)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %{{.*}}, ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM-COUNT-2:  call void %{{[0-9]*}}()

 ! LLVM: call void @llvm.memcpy.p0.p0.i32
@ -683,5 +683,5 @@ end
 ! LLVM: store { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] } { ptr null, i64 8, i32 20240719, i8 0, i8 42, i8 2, i8 1, ptr @_QMpolyEXdtXp1, [1 x i64] zeroinitializer }, ptr %[[ALLOCA1:[0-9]*]]
 ! LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[ALLOCA2:[0-9]+]], ptr %[[ALLOCA1]], i32 40, i1 false)
 ! LLVM: call void @_FortranAAllocatableInitDerivedForAllocate(ptr %[[ALLOCA2]], ptr @_QMpolyEXdtXp1, i32 0, i32 0)
-! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], i64 {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
+! LLVM: %{{.*}} = call i32 @_FortranAAllocatableAllocate(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
 ! LLVM: %{{.*}} = call i32 @_FortranAAllocatableDeallocatePolymorphic(ptr %[[ALLOCA2]], ptr {{.*}}, i1 false, ptr null, ptr @_QQclX{{.*}}, i32 {{.*}})
--- a/flang/test/Lower/allocatable-runtime.f90
+++ b/flang/test/Lower/allocatable-runtime.f90
@ -31,7 +31,7 @@ subroutine foo()
  ! CHECK: fir.call @{{.*}}AllocatableSetBounds(%[[xBoxCast2]], %c0{{.*}}, %[[xlbCast]], %[[xubCast]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
  ! CHECK-DAG: %[[xBoxCast3:.*]] = fir.convert %[[xBoxAddr]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
  ! CHECK-DAG: %[[sourceFile:.*]] = fir.convert %{{.*}} -> !fir.ref<i8>
-  ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %{{.*}}, %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+  ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %{{.*}}, %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

  ! Simply check that we are emitting the right numebr of set bound for y and z. Otherwise, this is just like x.
  ! CHECK: fir.convert %[[yBoxAddr]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
@ -180,4 +180,4 @@ end subroutine
 ! CHECK: %[[M_BOX_NONE:.*]] = fir.convert %[[EMBOX_M]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<none>
 ! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[M_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
--- a/flang/test/Lower/allocate-mold.f90
+++ b/flang/test/Lower/allocate-mold.f90
@ -16,7 +16,7 @@ end subroutine
 ! CHECK: %[[A_REF_BOX_NONE1:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_REF_BOX_NONE1]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK: %[[A_REF_BOX_NONE2:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32

 subroutine array_scalar_mold_allocation()
  real, allocatable :: a(:)
@ -40,4 +40,4 @@ end subroutine array_scalar_mold_allocation
 ! CHECK: %[[REF_BOX_A1:.*]] = fir.convert %1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[REF_BOX_A1]], {{.*}},{{.*}}, {{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[REF_BOX_A2:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@ -1149,7 +1149,7 @@ end program
 ! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} {
 ! CHECK: %[[ADDR_O:.*]] = fir.address_of(@_QFEo) : !fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ADDR_O]] : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK: %[[O:.*]] = fir.load %[[ADDR_O]] : !fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>
 ! CHECK: %[[COORD_INNER:.*]] = fir.coordinate_of %[[O]], inner : (!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>) -> !fir.ref<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
 ! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%arg1 = %{{.*}}) -> (!fir.array<5x!fir.logical<4>>) {
--- a/flang/test/Lower/volatile-allocatable.f90
+++ b/flang/test/Lower/volatile-allocatable.f90
@ -124,15 +124,15 @@ end subroutine
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_scalar_volatileEv2"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>, !fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>)
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_scalar_volatileEv3"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>, !fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>)
 ! CHECK:           fir.call @_FortranAAllocatableInitDerivedForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           fir.call @_FortranAAllocatableInitDerivedForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_scalar_volatileEv1"} : (!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>) -> (!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, !fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>)
 ! CHECK:           %{{.+}} = hlfir.designate %{{.+}}#0{"j"}   : (!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>) -> !fir.ref<i32, volatile>
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QMderived_typesText_type.0"} : (!fir.ref<!fir.type<{{.*}}>>) -> (!fir.ref<!fir.type<{{.*}}>>, !fir.ref<!fir.type<{{.*}}>>)
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocateSource(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} typeparams %{{.+}} {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX766F6C6174696C6520636861726163746572"} : (!fir.ref<!fir.char<1,18>>, index) -> (!fir.ref<!fir.char<1,18>>, !fir.ref<!fir.char<1,18>>)
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocatePolymorphic(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@ -144,7 +144,7 @@ end subroutine
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, asynchronous, volatile>, uniq_name = "_QFtest_volatile_asynchronousEv1"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>)
 ! CHECK:           fir.call @_FortranAAllocatableInitDerivedForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}}(%{{.+}}) {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro.4xi4.1"} : (!fir.ref<!fir.array<4xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<4xi32>>)
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocateSource(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@ -154,7 +154,7 @@ end subroutine
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_select_base_type_volatileEv"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>)
 ! CHECK:           fir.call @_FortranAAllocatableInitDerivedForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAClassIs(%{{.+}}, %{{.+}}) : (!fir.box<none>, !fir.ref<none>) -> i1
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}}(%{{.+}}) {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_select_base_type_volatileEv"} : (!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, !fir.shift<1>) -> (!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, !fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>)
 ! CHECK:           %{{.+}} = hlfir.designate %{{.+}}#0 (%{{.+}})  : (!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, index) -> !fir.class<!fir.type<{{.*}}>, volatile>
@ -170,22 +170,22 @@ end subroutine
 ! CHECK:           %{{.+}} = hlfir.designate %{{.+}}#0{"arr"}   shape %{{.+}} : (!fir.ref<!fir.type<{{.*}}>>, !fir.shape<1>) -> !fir.ref<!fir.array<2xi32>>
 ! CHECK:           fir.call @_FortranAAllocatableApplyMold(%{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK-LABEL:   func.func @_QPtest_unlimited_polymorphic() {
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_unlimited_polymorphicEup"} : (!fir.ref<!fir.class<!fir.heap<none>, volatile>, volatile>) -> (!fir.ref<!fir.class<!fir.heap<none>, volatile>, volatile>, !fir.ref<!fir.class<!fir.heap<none>, volatile>, volatile>)
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_unlimited_polymorphicEupa"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>, volatile>, volatile>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>, volatile>, volatile>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>, volatile>, volatile>)
 ! CHECK:           fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i32, i32, i32) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_unlimited_polymorphicEup"} : (!fir.heap<i32>) -> (!fir.heap<i32>, !fir.heap<i32>)
 ! CHECK:           fir.call @_FortranAAllocatableInitCharacterForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i32, i32, i32) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} typeparams %{{.+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_unlimited_polymorphicEup"} : (!fir.heap<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.heap<!fir.char<1,?>>)
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} typeparams %{{.+}} {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX636C617373282A29"} : (!fir.ref<!fir.char<1,8>>, index) -> (!fir.ref<!fir.char<1,8>>, !fir.ref<!fir.char<1,8>>)
 ! CHECK:           fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i32, i32, i32) -> ()
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}}(%{{.+}}) {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_unlimited_polymorphicEupa"} : (!fir.box<!fir.heap<!fir.array<?xf32>>, volatile>, !fir.shift<1>) -> (!fir.box<!fir.heap<!fir.array<?xf32>>, volatile>, !fir.box<!fir.heap<!fir.array<?xf32>>, volatile>)
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}}(%{{.+}}) {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro.3xr4.3"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>)
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocatePolymorphic(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
--- a/flang/test/Transforms/lower-repack-arrays.fir
+++ b/flang/test/Transforms/lower-repack-arrays.fir
@ -840,7 +840,7 @@ func.func @_QPtest6(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {fir.bi
 // CHECK:               %[[VAL_34:.*]] = fir.absent !fir.box<none>
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>
 // CHECK:               %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> !fir.box<none>
@ -928,7 +928,7 @@ func.func @_QPtest6_stack(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {
 // CHECK:               %[[VAL_34:.*]] = fir.absent !fir.box<none>
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>
 // CHECK:               %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> !fir.box<none>
@ -1015,7 +1015,7 @@ func.func @_QPtest7(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_name = "x
 // CHECK:               %[[VAL_34:.*]] = fir.absent !fir.box<none>
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>
 // CHECK:               %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.box<none>
@ -1103,7 +1103,7 @@ func.func @_QPtest7_stack(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_nam
 // CHECK:               %[[VAL_34:.*]] = fir.absent !fir.box<none>
 // CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %{{.*}}, %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>
 // CHECK:               %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}}
 // CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.box<none>