llvm-project/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp

//===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "GPUOpsLowering.h"

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

LLVM::LLVMFuncOp mlir::getOrDefineFunction(gpu::GPUModuleOp moduleOp,
                                           Location loc, OpBuilder &b,
                                           StringRef name,
                                           LLVM::LLVMFunctionType type) {
  LLVM::LLVMFuncOp ret;
  if (!(ret = moduleOp.template lookupSymbol<LLVM::LLVMFuncOp>(name))) {
    OpBuilder::InsertionGuard guard(b);
    b.setInsertionPointToStart(moduleOp.getBody());
    ret = LLVM::LLVMFuncOp::create(b, loc, name, type, LLVM::Linkage::External);
  }
  return ret;
}

static SmallString<16> getUniqueSymbolName(gpu::GPUModuleOp moduleOp,
                                           StringRef prefix) {
  // Get a unique global name.
  unsigned stringNumber = 0;
  SmallString<16> stringConstName;
  do {
    stringConstName.clear();
    (prefix + Twine(stringNumber++)).toStringRef(stringConstName);
  } while (moduleOp.lookupSymbol(stringConstName));
  return stringConstName;
}

LLVM::GlobalOp
mlir::getOrCreateStringConstant(OpBuilder &b, Location loc,
                                gpu::GPUModuleOp moduleOp, Type llvmI8,
                                StringRef namePrefix, StringRef str,
                                uint64_t alignment, unsigned addrSpace) {
  llvm::SmallString<20> nullTermStr(str);
  nullTermStr.push_back('\0'); // Null terminate for C
  auto globalType =
      LLVM::LLVMArrayType::get(llvmI8, nullTermStr.size_in_bytes());
  StringAttr attr = b.getStringAttr(nullTermStr);

  // Try to find existing global.
  for (auto globalOp : moduleOp.getOps<LLVM::GlobalOp>())
    if (globalOp.getGlobalType() == globalType && globalOp.getConstant() &&
        globalOp.getValueAttr() == attr &&
        globalOp.getAlignment().value_or(0) == alignment &&
        globalOp.getAddrSpace() == addrSpace)
      return globalOp;

  // Not found: create new global.
  OpBuilder::InsertionGuard guard(b);
  b.setInsertionPointToStart(moduleOp.getBody());
  SmallString<16> name = getUniqueSymbolName(moduleOp, namePrefix);
  return LLVM::GlobalOp::create(b, loc, globalType,
                                /*isConstant=*/true, LLVM::Linkage::Internal,
                                name, attr, alignment, addrSpace);
}

LogicalResult
GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
                                   ConversionPatternRewriter &rewriter) const {
  Location loc = gpuFuncOp.getLoc();

  SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
  if (encodeWorkgroupAttributionsAsArguments) {
    // Append an `llvm.ptr` argument to the function signature to encode
    // workgroup attributions.

    ArrayRef<BlockArgument> workgroupAttributions =
        gpuFuncOp.getWorkgroupAttributions();
    size_t numAttributions = workgroupAttributions.size();

    // Insert all arguments at the end.
    unsigned index = gpuFuncOp.getNumArguments();
    SmallVector<unsigned> argIndices(numAttributions, index);

    // New arguments will simply be `llvm.ptr` with the correct address space
    Type workgroupPtrType =
        rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
    SmallVector<Type> argTypes(numAttributions, workgroupPtrType);

    // Attributes: noalias, llvm.mlir.workgroup_attribution(<size>, <type>)
    std::array attrs{
        rewriter.getNamedAttr(LLVM::LLVMDialect::getNoAliasAttrName(),
                              rewriter.getUnitAttr()),
        rewriter.getNamedAttr(
            getDialect().getWorkgroupAttributionAttrHelper().getName(),
            rewriter.getUnitAttr()),
    };
    SmallVector<DictionaryAttr> argAttrs;
    for (BlockArgument attribution : workgroupAttributions) {
      auto attributionType = cast<MemRefType>(attribution.getType());
      IntegerAttr numElements =
          rewriter.getI64IntegerAttr(attributionType.getNumElements());
      Type llvmElementType =
          getTypeConverter()->convertType(attributionType.getElementType());
      if (!llvmElementType)
        return failure();
      TypeAttr type = TypeAttr::get(llvmElementType);
      attrs.back().setValue(
          rewriter.getAttr<LLVM::WorkgroupAttributionAttr>(numElements, type));
      argAttrs.push_back(rewriter.getDictionaryAttr(attrs));
    }

    // Location match function location
    SmallVector<Location> argLocs(numAttributions, gpuFuncOp.getLoc());

    // Perform signature modification
    rewriter.modifyOpInPlace(
        gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() {
          LogicalResult inserted =
              static_cast<FunctionOpInterface>(gpuFuncOp).insertArguments(
                  argIndices, argTypes, argAttrs, argLocs);
          (void)inserted;
          assert(succeeded(inserted) &&
                 "expected GPU funcs to support inserting any argument");
        });
  } else {
    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
    for (auto [idx, attribution] :
         llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
      auto type = dyn_cast<MemRefType>(attribution.getType());
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      uint64_t numElements = type.getNumElements();

      auto elementType =
          cast<Type>(typeConverter->convertType(type.getElementType()));
      auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
      std::string name =
          std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
      uint64_t alignment = 0;
      if (auto alignAttr = dyn_cast_or_null<IntegerAttr>(
              gpuFuncOp.getWorkgroupAttributionAttr(
                  idx, LLVM::LLVMDialect::getAlignAttrName())))
        alignment = alignAttr.getInt();
      auto globalOp = LLVM::GlobalOp::create(
          rewriter, gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
          LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
          workgroupAddrSpace);
      workgroupBuffers.push_back(globalOp);
    }
  }

  // Remap proper input types.
  TypeConverter::SignatureConversion signatureConversion(
      gpuFuncOp.front().getNumArguments());

  Type funcType = getTypeConverter()->convertFunctionSignature(
      gpuFuncOp.getFunctionType(), /*isVariadic=*/false,
      getTypeConverter()->getOptions().useBarePtrCallConv, signatureConversion);
  if (!funcType) {
    return rewriter.notifyMatchFailure(gpuFuncOp, [&](Diagnostic &diag) {
      diag << "failed to convert function signature type for: "
           << gpuFuncOp.getFunctionType();
    });
  }

  // Create the new function operation. Only copy those attributes that are
  // not specific to function modeling.
  SmallVector<NamedAttribute, 4> attributes;
  ArrayAttr argAttrs;
  for (const auto &attr : gpuFuncOp->getAttrs()) {
    if (attr.getName() == SymbolTable::getSymbolAttrName() ||
        attr.getName() == gpuFuncOp.getFunctionTypeAttrName() ||
        attr.getName() ==
            gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName() ||
        attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
        attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
        attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
        attr.getName() == gpuFuncOp.getKnownGridSizeAttrName())
      continue;
    if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
      argAttrs = gpuFuncOp.getArgAttrsAttr();
      continue;
    }
    attributes.push_back(attr);
  }

  DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
  DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
  // Ensure we don't lose information if the function is lowered before its
  // surrounding context.
  auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
  if (knownBlockSize)
    attributes.emplace_back(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
                            knownBlockSize);
  if (knownGridSize)
    attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
                            knownGridSize);

  // Add a dialect specific kernel attribute in addition to GPU kernel
  // attribute. The former is necessary for further translation while the
  // latter is expected by gpu.launch_func.
  if (gpuFuncOp.isKernel()) {
    if (kernelAttributeName)
      attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
    // Set the dialect-specific block size attribute if there is one.
    if (kernelBlockSizeAttributeName && knownBlockSize) {
      attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize);
    }
  }
  LLVM::CConv callingConvention = gpuFuncOp.isKernel()
                                      ? kernelCallingConvention
                                      : nonKernelCallingConvention;
  auto llvmFuncOp = LLVM::LLVMFuncOp::create(
      rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
      LLVM::Linkage::External, /*dsoLocal=*/false, callingConvention,
      /*comdat=*/nullptr, attributes);

  {
    // Insert operations that correspond to converted workgroup and private
    // memory attributions to the body of the function. This must operate on
    // the original function, before the body region is inlined in the new
    // function to maintain the relation between block arguments and the
    // parent operation that assigns their semantics.
    OpBuilder::InsertionGuard guard(rewriter);

    // Rewrite workgroup memory attributions to addresses of global buffers.
    rewriter.setInsertionPointToStart(&gpuFuncOp.front());
    unsigned numProperArguments = gpuFuncOp.getNumArguments();

    if (encodeWorkgroupAttributionsAsArguments) {
      // Build a MemRefDescriptor with each of the arguments added above.

      unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions();
      assert(numProperArguments >= numAttributions &&
             "Expecting attributions to be encoded as arguments already");

      // Arguments encoding workgroup attributions will be in positions
      // [numProperArguments, numProperArguments+numAttributions)
      ArrayRef<BlockArgument> attributionArguments =
          gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
                                         numAttributions);
      for (auto [idx, vals] : llvm::enumerate(llvm::zip_equal(
               gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) {
        auto [attribution, arg] = vals;
        auto type = cast<MemRefType>(attribution.getType());

        // Arguments are of llvm.ptr type and attributions are of memref type:
        // we need to wrap them in memref descriptors.
        Value descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, *getTypeConverter(), type, arg);

        // And remap the arguments
        signatureConversion.remapInput(numProperArguments + idx, descr);
      }
    } else {
      for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
        auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
                                                  global.getAddrSpace());
        Value address = LLVM::AddressOfOp::create(rewriter, loc, ptrType,
                                                  global.getSymNameAttr());
        Value memory =
            LLVM::GEPOp::create(rewriter, loc, ptrType, global.getType(),
                                address, ArrayRef<LLVM::GEPArg>{0, 0});

        // Build a memref descriptor pointing to the buffer to plug with the
        // existing memref infrastructure. This may use more registers than
        // otherwise necessary given that memref sizes are fixed, but we can try
        // and canonicalize that away later.
        Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
        auto type = cast<MemRefType>(attribution.getType());
        Value descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, *getTypeConverter(), type, memory);
        signatureConversion.remapInput(numProperArguments + idx, descr);
      }
    }

    // Rewrite private memory attributions to alloca'ed buffers.
    unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
    auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
    for (const auto [idx, attribution] :
         llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
      auto type = cast<MemRefType>(attribution.getType());
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      // Explicitly drop memory space when lowering private memory
      // attributions since NVVM models it as `alloca`s in the default
      // memory space and does not support `alloca`s with addrspace(5).
      Type elementType = typeConverter->convertType(type.getElementType());
      auto ptrType =
          LLVM::LLVMPointerType::get(rewriter.getContext(), allocaAddrSpace);
      Value numElements = LLVM::ConstantOp::create(
          rewriter, gpuFuncOp.getLoc(), int64Ty, type.getNumElements());
      uint64_t alignment = 0;
      if (auto alignAttr =
              dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getPrivateAttributionAttr(
                  idx, LLVM::LLVMDialect::getAlignAttrName())))
        alignment = alignAttr.getInt();
      Value allocated =
          LLVM::AllocaOp::create(rewriter, gpuFuncOp.getLoc(), ptrType,
                                 elementType, numElements, alignment);
      Value descr = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), type, allocated);
      signatureConversion.remapInput(
          numProperArguments + numWorkgroupAttributions + idx, descr);
    }
  }

  // Move the region to the new function, update the entry block signature.
  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                              llvmFuncOp.end());
  if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
                                         &signatureConversion)))
    return failure();

  // Get memref type from function arguments and set the noalias to
  // pointer arguments.
  for (const auto [idx, argTy] :
       llvm::enumerate(gpuFuncOp.getArgumentTypes())) {
    auto remapping = signatureConversion.getInputMapping(idx);
    NamedAttrList argAttr =
        argAttrs ? cast<DictionaryAttr>(argAttrs[idx]) : NamedAttrList();
    auto copyAttribute = [&](StringRef attrName) {
      Attribute attr = argAttr.erase(attrName);
      if (!attr)
        return;
      for (size_t i = 0, e = remapping->size; i < e; ++i)
        llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
    };
    auto copyPointerAttribute = [&](StringRef attrName) {
      Attribute attr = argAttr.erase(attrName);

      if (!attr)
        return;
      if (remapping->size > 1 &&
          attrName == LLVM::LLVMDialect::getNoAliasAttrName()) {
        emitWarning(llvmFuncOp.getLoc(),
                    "Cannot copy noalias with non-bare pointers.\n");
        return;
      }
      for (size_t i = 0, e = remapping->size; i < e; ++i) {
        if (isa<LLVM::LLVMPointerType>(
                llvmFuncOp.getArgument(remapping->inputNo + i).getType())) {
          llvmFuncOp.setArgAttr(remapping->inputNo + i, attrName, attr);
        }
      }
    };

    if (argAttr.empty())
      continue;

    copyAttribute(LLVM::LLVMDialect::getReturnedAttrName());
    copyAttribute(LLVM::LLVMDialect::getNoUndefAttrName());
    copyAttribute(LLVM::LLVMDialect::getInRegAttrName());
    bool lowersToPointer = false;
    for (size_t i = 0, e = remapping->size; i < e; ++i) {
      lowersToPointer |= isa<LLVM::LLVMPointerType>(
          llvmFuncOp.getArgument(remapping->inputNo + i).getType());
    }

    if (lowersToPointer) {
      copyPointerAttribute(LLVM::LLVMDialect::getNoAliasAttrName());
      copyPointerAttribute(LLVM::LLVMDialect::getNoCaptureAttrName());
      copyPointerAttribute(LLVM::LLVMDialect::getNoFreeAttrName());
      copyPointerAttribute(LLVM::LLVMDialect::getAlignAttrName());
      copyPointerAttribute(LLVM::LLVMDialect::getReadonlyAttrName());
      copyPointerAttribute(LLVM::LLVMDialect::getWriteOnlyAttrName());
      copyPointerAttribute(LLVM::LLVMDialect::getReadnoneAttrName());
      copyPointerAttribute(LLVM::LLVMDialect::getNonNullAttrName());
      copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName());
      copyPointerAttribute(
          LLVM::LLVMDialect::getDereferenceableOrNullAttrName());
      copyPointerAttribute(
          LLVM::LLVMDialect::WorkgroupAttributionAttrHelper::getNameStr());
    }
  }
  rewriter.eraseOp(gpuFuncOp);
  return success();
}

LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
    gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = gpuPrintfOp->getLoc();

  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
  auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
  mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
  mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
  // This ensures that global constants and declarations are placed within
  // the device code, not the host code
  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();

  auto ocklBegin =
      getOrDefineFunction(moduleOp, loc, rewriter, "__ockl_printf_begin",
                          LLVM::LLVMFunctionType::get(llvmI64, {llvmI64}));
  LLVM::LLVMFuncOp ocklAppendArgs;
  if (!adaptor.getArgs().empty()) {
    ocklAppendArgs = getOrDefineFunction(
        moduleOp, loc, rewriter, "__ockl_printf_append_args",
        LLVM::LLVMFunctionType::get(
            llvmI64, {llvmI64, /*numArgs*/ llvmI32, llvmI64, llvmI64, llvmI64,
                      llvmI64, llvmI64, llvmI64, llvmI64, /*isLast*/ llvmI32}));
  }
  auto ocklAppendStringN = getOrDefineFunction(
      moduleOp, loc, rewriter, "__ockl_printf_append_string_n",
      LLVM::LLVMFunctionType::get(
          llvmI64,
          {llvmI64, ptrType, /*length (bytes)*/ llvmI64, /*isLast*/ llvmI32}));

  /// Start the printf hostcall
  Value zeroI64 = LLVM::ConstantOp::create(rewriter, loc, llvmI64, 0);
  auto printfBeginCall =
      LLVM::CallOp::create(rewriter, loc, ocklBegin, zeroI64);
  Value printfDesc = printfBeginCall.getResult();

  // Create the global op or find an existing one.
  LLVM::GlobalOp global = getOrCreateStringConstant(
      rewriter, loc, moduleOp, llvmI8, "printfFormat_", adaptor.getFormat());

  // Get a pointer to the format string's first element and pass it to printf()
  Value globalPtr = LLVM::AddressOfOp::create(
      rewriter, loc,
      LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
      global.getSymNameAttr());
  Value stringStart =
      LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
                          globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
  Value stringLen = LLVM::ConstantOp::create(
      rewriter, loc, llvmI64, cast<StringAttr>(global.getValueAttr()).size());

  Value oneI32 = LLVM::ConstantOp::create(rewriter, loc, llvmI32, 1);
  Value zeroI32 = LLVM::ConstantOp::create(rewriter, loc, llvmI32, 0);

  auto appendFormatCall = LLVM::CallOp::create(
      rewriter, loc, ocklAppendStringN,
      ValueRange{printfDesc, stringStart, stringLen,
                 adaptor.getArgs().empty() ? oneI32 : zeroI32});
  printfDesc = appendFormatCall.getResult();

  // __ockl_printf_append_args takes 7 values per append call
  constexpr size_t argsPerAppend = 7;
  size_t nArgs = adaptor.getArgs().size();
  for (size_t group = 0; group < nArgs; group += argsPerAppend) {
    size_t bound = std::min(group + argsPerAppend, nArgs);
    size_t numArgsThisCall = bound - group;

    SmallVector<mlir::Value, 2 + argsPerAppend + 1> arguments;
    arguments.push_back(printfDesc);
    arguments.push_back(
        LLVM::ConstantOp::create(rewriter, loc, llvmI32, numArgsThisCall));
    for (size_t i = group; i < bound; ++i) {
      Value arg = adaptor.getArgs()[i];
      if (auto floatType = dyn_cast<FloatType>(arg.getType())) {
        if (!floatType.isF64())
          arg = LLVM::FPExtOp::create(
              rewriter, loc, typeConverter->convertType(rewriter.getF64Type()),
              arg);
        arg = LLVM::BitcastOp::create(rewriter, loc, llvmI64, arg);
      }
      if (arg.getType().getIntOrFloatBitWidth() != 64)
        arg = LLVM::ZExtOp::create(rewriter, loc, llvmI64, arg);

      arguments.push_back(arg);
    }
    // Pad out to 7 arguments since the hostcall always needs 7
    for (size_t extra = numArgsThisCall; extra < argsPerAppend; ++extra) {
      arguments.push_back(zeroI64);
    }

    auto isLast = (bound == nArgs) ? oneI32 : zeroI32;
    arguments.push_back(isLast);
    auto call = LLVM::CallOp::create(rewriter, loc, ocklAppendArgs, arguments);
    printfDesc = call.getResult();
  }
  rewriter.eraseOp(gpuPrintfOp);
  return success();
}

LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
    gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = gpuPrintfOp->getLoc();

  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
  mlir::Type ptrType =
      LLVM::LLVMPointerType::get(rewriter.getContext(), addressSpace);

  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
  // This ensures that global constants and declarations are placed within
  // the device code, not the host code
  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();

  auto printfType =
      LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType},
                                  /*isVarArg=*/true);
  LLVM::LLVMFuncOp printfDecl =
      getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType);

  // Create the global op or find an existing one.
  LLVM::GlobalOp global = getOrCreateStringConstant(
      rewriter, loc, moduleOp, llvmI8, "printfFormat_", adaptor.getFormat(),
      /*alignment=*/0, addressSpace);

  // Get a pointer to the format string's first element
  Value globalPtr = LLVM::AddressOfOp::create(
      rewriter, loc,
      LLVM::LLVMPointerType::get(rewriter.getContext(), global.getAddrSpace()),
      global.getSymNameAttr());
  Value stringStart =
      LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
                          globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});

  // Construct arguments and function call
  auto argsRange = adaptor.getArgs();
  SmallVector<Value, 4> printfArgs;
  printfArgs.reserve(argsRange.size() + 1);
  printfArgs.push_back(stringStart);
  printfArgs.append(argsRange.begin(), argsRange.end());

  LLVM::CallOp::create(rewriter, loc, printfDecl, printfArgs);
  rewriter.eraseOp(gpuPrintfOp);
  return success();
}

LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite(
    gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = gpuPrintfOp->getLoc();

  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
  mlir::Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());

  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
  // This ensures that global constants and declarations are placed within
  // the device code, not the host code
  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();

  // Create a valid global location removing any metadata attached to the
  // location as debug info metadata inside of a function cannot be used outside
  // of that function.
  Location globalLoc = loc->findInstanceOfOrUnknown<FileLineColLoc>();

  auto vprintfType =
      LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType, ptrType});
  LLVM::LLVMFuncOp vprintfDecl = getOrDefineFunction(
      moduleOp, globalLoc, rewriter, "vprintf", vprintfType);

  // Create the global op or find an existing one.
  LLVM::GlobalOp global =
      getOrCreateStringConstant(rewriter, globalLoc, moduleOp, llvmI8,
                                "printfFormat_", adaptor.getFormat());

  // Get a pointer to the format string's first element
  Value globalPtr = LLVM::AddressOfOp::create(rewriter, loc, global);
  Value stringStart =
      LLVM::GEPOp::create(rewriter, loc, ptrType, global.getGlobalType(),
                          globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
  SmallVector<Type> types;
  SmallVector<Value> args;
  // Promote and pack the arguments into a stack allocation.
  for (Value arg : adaptor.getArgs()) {
    Type type = arg.getType();
    Value promotedArg = arg;
    assert(type.isIntOrFloat());
    if (isa<FloatType>(type)) {
      type = rewriter.getF64Type();
      promotedArg = LLVM::FPExtOp::create(rewriter, loc, type, arg);
    }
    types.push_back(type);
    args.push_back(promotedArg);
  }
  Type structType =
      LLVM::LLVMStructType::getLiteral(gpuPrintfOp.getContext(), types);
  Value one = LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(),
                                       rewriter.getIndexAttr(1));
  Value tempAlloc =
      LLVM::AllocaOp::create(rewriter, loc, ptrType, structType, one,
                             /*alignment=*/0);
  for (auto [index, arg] : llvm::enumerate(args)) {
    Value ptr = LLVM::GEPOp::create(
        rewriter, loc, ptrType, structType, tempAlloc,
        ArrayRef<LLVM::GEPArg>{0, static_cast<int32_t>(index)});
    LLVM::StoreOp::create(rewriter, loc, arg, ptr);
  }
  std::array<Value, 2> printfArgs = {stringStart, tempAlloc};

  LLVM::CallOp::create(rewriter, loc, vprintfDecl, printfArgs);
  rewriter.eraseOp(gpuPrintfOp);
  return success();
}

/// Helper for impl::scalarizeVectorOp. Scalarizes vectors to elements.
/// Used either directly (for ops on 1D vectors) or as the callback passed to
/// detail::handleMultidimensionalVectors (for ops on higher-rank vectors).
static Value scalarizeVectorOpHelper(Operation *op, ValueRange operands,
                                     Type llvm1DVectorTy,
                                     ConversionPatternRewriter &rewriter,
                                     const LLVMTypeConverter &converter) {
  TypeRange operandTypes(operands);
  VectorType vectorType = cast<VectorType>(llvm1DVectorTy);
  Location loc = op->getLoc();
  Value result = LLVM::PoisonOp::create(rewriter, loc, vectorType);
  Type indexType = converter.convertType(rewriter.getIndexType());
  StringAttr name = op->getName().getIdentifier();
  Type elementType = vectorType.getElementType();

  for (int64_t i = 0; i < vectorType.getNumElements(); ++i) {
    Value index = LLVM::ConstantOp::create(rewriter, loc, indexType, i);
    auto extractElement = [&](Value operand) -> Value {
      if (!isa<VectorType>(operand.getType()))
        return operand;
      return LLVM::ExtractElementOp::create(rewriter, loc, operand, index);
    };
    auto scalarOperands = llvm::map_to_vector(operands, extractElement);
    Operation *scalarOp =
        rewriter.create(loc, name, scalarOperands, elementType, op->getAttrs());
    result = LLVM::InsertElementOp::create(rewriter, loc, result,
                                           scalarOp->getResult(0), index);
  }
  return result;
}

/// Unrolls op to array/vector elements.
LogicalResult impl::scalarizeVectorOp(Operation *op, ValueRange operands,
                                      ConversionPatternRewriter &rewriter,
                                      const LLVMTypeConverter &converter) {
  TypeRange operandTypes(operands);
  if (llvm::any_of(operandTypes, llvm::IsaPred<VectorType>)) {
    VectorType vectorType =
        cast<VectorType>(converter.convertType(op->getResultTypes()[0]));
    rewriter.replaceOp(op, scalarizeVectorOpHelper(op, operands, vectorType,
                                                   rewriter, converter));
    return success();
  }

  if (llvm::any_of(operandTypes, llvm::IsaPred<LLVM::LLVMArrayType>)) {
    return LLVM::detail::handleMultidimensionalVectors(
        op, operands, converter,
        [&](Type llvm1DVectorTy, ValueRange operands) -> Value {
          return scalarizeVectorOpHelper(op, operands, llvm1DVectorTy, rewriter,
                                         converter);
        },
        rewriter);
  }

  return rewriter.notifyMatchFailure(op, "no llvm.array or vector to unroll");
}

static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) {
  return IntegerAttr::get(IntegerType::get(ctx, 64), space);
}

/// Generates a symbol with 0-sized array type for dynamic shared memory usage,
/// or uses existing symbol.
LLVM::GlobalOp getDynamicSharedMemorySymbol(
    ConversionPatternRewriter &rewriter, gpu::GPUModuleOp moduleOp,
    gpu::DynamicSharedMemoryOp op, const LLVMTypeConverter *typeConverter,
    MemRefType memrefType, unsigned alignmentBit) {
  uint64_t alignmentByte = alignmentBit / memrefType.getElementTypeBitWidth();

  FailureOr<unsigned> addressSpace =
      typeConverter->getMemRefAddressSpace(memrefType);
  if (failed(addressSpace)) {
    op->emitError() << "conversion of memref memory space "
                    << memrefType.getMemorySpace()
                    << " to integer address space "
                       "failed. Consider adding memory space conversions.";
  }

  // Step 1. Collect symbol names of LLVM::GlobalOp Ops. Also if any of
  // LLVM::GlobalOp is suitable for shared memory, return it.
  llvm::StringSet<> existingGlobalNames;
  for (auto globalOp : moduleOp.getBody()->getOps<LLVM::GlobalOp>()) {
    existingGlobalNames.insert(globalOp.getSymName());
    if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(globalOp.getType())) {
      if (globalOp.getAddrSpace() == addressSpace.value() &&
          arrayType.getNumElements() == 0 &&
          globalOp.getAlignment().value_or(0) == alignmentByte) {
        return globalOp;
      }
    }
  }

  // Step 2. Find a unique symbol name
  unsigned uniquingCounter = 0;
  SmallString<128> symName = SymbolTable::generateSymbolName<128>(
      "__dynamic_shmem_",
      [&](StringRef candidate) {
        return existingGlobalNames.contains(candidate);
      },
      uniquingCounter);

  // Step 3. Generate a global op
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointToStart(moduleOp.getBody());

  auto zeroSizedArrayType = LLVM::LLVMArrayType::get(
      typeConverter->convertType(memrefType.getElementType()), 0);

  return LLVM::GlobalOp::create(rewriter, op->getLoc(), zeroSizedArrayType,
                                /*isConstant=*/false, LLVM::Linkage::Internal,
                                symName, /*value=*/Attribute(), alignmentByte,
                                addressSpace.value());
}

LogicalResult GPUDynamicSharedMemoryOpLowering::matchAndRewrite(
    gpu::DynamicSharedMemoryOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = op.getLoc();
  MemRefType memrefType = op.getResultMemref().getType();
  Type elementType = typeConverter->convertType(memrefType.getElementType());

  // Step 1: Generate a memref<0xi8> type
  MemRefLayoutAttrInterface layout = {};
  auto memrefType0sz =
      MemRefType::get({0}, elementType, layout, memrefType.getMemorySpace());

  // Step 2: Generate a global symbol or existing for the dynamic shared
  // memory with memref<0xi8> type
  auto moduleOp = op->getParentOfType<gpu::GPUModuleOp>();
  LLVM::GlobalOp shmemOp = getDynamicSharedMemorySymbol(
      rewriter, moduleOp, op, getTypeConverter(), memrefType0sz, alignmentBit);

  // Step 3. Get address of the global symbol
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(op);
  auto basePtr = LLVM::AddressOfOp::create(rewriter, loc, shmemOp);
  Type baseType = basePtr->getResultTypes().front();

  // Step 4. Generate GEP using offsets
  SmallVector<LLVM::GEPArg> gepArgs = {0};
  Value shmemPtr = LLVM::GEPOp::create(rewriter, loc, baseType, elementType,
                                       basePtr, gepArgs);
  // Step 5. Create a memref descriptor
  SmallVector<Value> shape, strides;
  Value sizeBytes;
  getMemRefDescriptorSizes(loc, memrefType0sz, {}, rewriter, shape, strides,
                           sizeBytes);
  auto memRefDescriptor = this->createMemRefDescriptor(
      loc, memrefType0sz, shmemPtr, shmemPtr, shape, strides, rewriter);

  // Step 5. Replace the op with memref descriptor
  rewriter.replaceOp(op, {memRefDescriptor});
  return success();
}

LogicalResult GPUReturnOpLowering::matchAndRewrite(
    gpu::ReturnOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = op.getLoc();
  unsigned numArguments = op.getNumOperands();
  SmallVector<Value, 4> updatedOperands;

  bool useBarePtrCallConv = getTypeConverter()->getOptions().useBarePtrCallConv;
  if (useBarePtrCallConv) {
    // For the bare-ptr calling convention, extract the aligned pointer to
    // be returned from the memref descriptor.
    for (auto it : llvm::zip(op->getOperands(), adaptor.getOperands())) {
      Type oldTy = std::get<0>(it).getType();
      Value newOperand = std::get<1>(it);
      if (isa<MemRefType>(oldTy) && getTypeConverter()->canConvertToBarePtr(
                                        cast<BaseMemRefType>(oldTy))) {
        MemRefDescriptor memrefDesc(newOperand);
        newOperand = memrefDesc.allocatedPtr(rewriter, loc);
      } else if (isa<UnrankedMemRefType>(oldTy)) {
        // Unranked memref is not supported in the bare pointer calling
        // convention.
        return failure();
      }
      updatedOperands.push_back(newOperand);
    }
  } else {
    updatedOperands = llvm::to_vector<4>(adaptor.getOperands());
    (void)copyUnrankedDescriptors(rewriter, loc, op.getOperands().getTypes(),
                                  updatedOperands,
                                  /*toDynamic=*/true);
  }

  // If ReturnOp has 0 or 1 operand, create it and return immediately.
  if (numArguments <= 1) {
    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(
        op, TypeRange(), updatedOperands, op->getAttrs());
    return success();
  }

  // Otherwise, we need to pack the arguments into an LLVM struct type before
  // returning.
  auto packedType = getTypeConverter()->packFunctionResults(
      op.getOperandTypes(), useBarePtrCallConv);
  if (!packedType) {
    return rewriter.notifyMatchFailure(op, "could not convert result types");
  }

  Value packed = LLVM::PoisonOp::create(rewriter, loc, packedType);
  for (auto [idx, operand] : llvm::enumerate(updatedOperands)) {
    packed = LLVM::InsertValueOp::create(rewriter, loc, packed, operand, idx);
  }
  rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, TypeRange(), packed,
                                              op->getAttrs());
  return success();
}

void mlir::populateGpuMemorySpaceAttributeConversions(
    TypeConverter &typeConverter, const MemorySpaceMapping &mapping) {
  typeConverter.addTypeAttributeConversion(
      [mapping](BaseMemRefType type, gpu::AddressSpaceAttr memorySpaceAttr) {
        gpu::AddressSpace memorySpace = memorySpaceAttr.getValue();
        unsigned addressSpace = mapping(memorySpace);
        return wrapNumericMemorySpace(memorySpaceAttr.getContext(),
                                      addressSpace);
      });
}