[MLIR] Split ExecutionEngine Initialization out of ctor into an explicit method call (#153524)

Retry landing https://github.com/llvm/llvm-project/pull/153373 ## Major changes from previous attempt - remove the test in CAPI because no existing tests in CAPI deal with sanitizer exemptions - update `mlir/docs/Dialects/GPU.md` to reflect the new behavior: load GPU binary in global ctors, instead of loading them at call site. - skip the test on Aarch64 since we have an issue with initialization there --------- Co-authored-by: Mehdi Amini <joker.eph@gmail.com>
2025-08-18 05:07:24 +08:00 · 2025-08-18 05:07:24 +08:00 · 7610b13729
commit 7610b13729
parent 40833eea21
14 changed files with 310 additions and 13 deletions
--- a/mlir/docs/Dialects/GPU.md
+++ b/mlir/docs/Dialects/GPU.md
@ -193,10 +193,25 @@ llvm.func @foo() {
 // mlir-translate --mlir-to-llvmir:
@binary_bin_cst = internal constant [6 x i8] c"AMDGPU", align 8
@binary_func_kernel_name = private unnamed_addr constant [7 x i8] c"func\00", align 1
+@binary_module = internal global ptr null
+@llvm.global_ctors = appending global [1 x {i32, ptr, ptr}] [{i32 123, ptr @binary_load, ptr null}]
+@llvm.global_dtors = appending global [1 x {i32, ptr, ptr}] [{i32 123, ptr @binary_unload, ptr null}]
+define internal void @binary_load() section ".text.startup" {
+entry:
+  %0 = call ptr @mgpuModuleLoad(ptr @binary_bin_cst)
+  store ptr %0, ptr @binary_module
+  ...
+}
+define internal void @binary_unload() section ".text.startup" {
+entry:
+  %0 = load ptr, ptr @binary_module, align 8
+  call void @mgpuModuleUnload(ptr %0)
+  ...
+}
 ...
 define void @foo() {
  ...
-  %module = call ptr @mgpuModuleLoad(ptr @binary_bin_cst)
+  %module = load ptr, ptr @binary_module, align 8
  %kernel = call ptr @mgpuModuleGetFunction(ptr %module, ptr @binary_func_kernel_name)
  call void @mgpuLaunchKernel(ptr %kernel, ...) ; Launch the kernel
  ...
--- a/mlir/include/mlir-c/ExecutionEngine.h
+++ b/mlir/include/mlir-c/ExecutionEngine.h
@ -46,6 +46,13 @@ MLIR_CAPI_EXPORTED MlirExecutionEngine mlirExecutionEngineCreate(
    MlirModule op, int optLevel, int numPaths,
    const MlirStringRef *sharedLibPaths, bool enableObjectDump);

+/// Initialize the ExecutionEngine. Global constructors specified by
+/// `llvm.mlir.global_ctors` will be run. One common scenario is that kernel
+/// binary compiled from `gpu.module` gets loaded during initialization. Make
+/// sure all symbols are resolvable before initialization by calling
+/// `mlirExecutionEngineRegisterSymbol` or including shared libraries.
+MLIR_CAPI_EXPORTED void mlirExecutionEngineInitialize(MlirExecutionEngine jit);
+
 /// Destroy an ExecutionEngine instance.
 MLIR_CAPI_EXPORTED void mlirExecutionEngineDestroy(MlirExecutionEngine jit);

--- a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
+++ b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h
@ -227,6 +227,13 @@ public:
      llvm::function_ref<llvm::orc::SymbolMap(llvm::orc::MangleAndInterner)>
          symbolMap);

+  /// Initialize the ExecutionEngine. Global constructors specified by
+  /// `llvm.mlir.global_ctors` will be run. One common scenario is that kernel
+  /// binary compiled from `gpu.module` gets loaded during initialization. Make
+  /// sure all symbols are resolvable before initialization by calling
+  /// `registerSymbols` or including shared libraries.
+  void initialize();
+
 private:
  /// Ordering of llvmContext and jit is important for destruction purposes: the
  /// jit must be destroyed before the context.
@ -250,6 +257,8 @@ private:
  /// Destroy functions in the libraries loaded by the ExecutionEngine that are
  /// called when this ExecutionEngine is destructed.
  SmallVector<LibraryDestroyFn> destroyFns;
+
+  bool isInitialized = false;
 };

 } // namespace mlir
--- a/mlir/lib/Bindings/Python/ExecutionEngineModule.cpp
+++ b/mlir/lib/Bindings/Python/ExecutionEngineModule.cpp
@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//

 #include "mlir-c/ExecutionEngine.h"
-#include "mlir/Bindings/Python/NanobindAdaptors.h"
 #include "mlir/Bindings/Python/Nanobind.h"
+#include "mlir/Bindings/Python/NanobindAdaptors.h"

 namespace nb = nanobind;
 using namespace mlir;
@ -124,6 +124,17 @@ NB_MODULE(_mlirExecutionEngine, m) {
          },
          nb::arg("name"), nb::arg("callback"),
          "Register `callback` as the runtime symbol `name`.")
+      .def(
+          "initialize",
+          [](PyExecutionEngine &executionEngine) {
+            mlirExecutionEngineInitialize(executionEngine.get());
+          },
+          "Initialize the ExecutionEngine. Global constructors specified by "
+          "`llvm.mlir.global_ctors` will be run. One common scenario is that "
+          "kernel binary compiled from `gpu.module` gets loaded during "
+          "initialization. Make sure all symbols are resolvable before "
+          "initialization by calling `register_runtime` or including "
+          "shared libraries.")
      .def(
          "dump_to_object_file",
          [](PyExecutionEngine &executionEngine, const std::string &fileName) {
--- a/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp
+++ b/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp
@ -68,6 +68,10 @@ mlirExecutionEngineCreate(MlirModule op, int optLevel, int numPaths,
  return wrap(jitOrError->release());
 }

+extern "C" void mlirExecutionEngineInitialize(MlirExecutionEngine jit) {
+  unwrap(jit)->initialize();
+}
+
 extern "C" void mlirExecutionEngineDestroy(MlirExecutionEngine jit) {
  delete (unwrap(jit));
 }
@ -106,9 +110,8 @@ extern "C" void mlirExecutionEngineRegisterSymbol(MlirExecutionEngine jit,
                                                  void *sym) {
  unwrap(jit)->registerSymbols([&](llvm::orc::MangleAndInterner interner) {
    llvm::orc::SymbolMap symbolMap;
-    symbolMap[interner(unwrap(name))] =
-        { llvm::orc::ExecutorAddr::fromPtr(sym),
-          llvm::JITSymbolFlags::Exported };
+    symbolMap[interner(unwrap(name))] = {llvm::orc::ExecutorAddr::fromPtr(sym),
+                                         llvm::JITSymbolFlags::Exported};
    return symbolMap;
  });
 }
--- a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp
@ -400,13 +400,6 @@ ExecutionEngine::create(Operation *m, const ExecutionEngineOptions &options,
    return symbolMap;
  };
  engine->registerSymbols(runtimeSymbolMap);
-
-  // Execute the global constructors from the module being processed.
-  // TODO: Allow JIT initialize for AArch64. Currently there's a bug causing a
-  // crash for AArch64 see related issue #71963.
-  if (!engine->jit->getTargetTriple().isAArch64())
-    cantFail(engine->jit->initialize(engine->jit->getMainJITDylib()));
-
  return std::move(engine);
 }

@ -442,6 +435,7 @@ Expected<void *> ExecutionEngine::lookup(StringRef name) const {

 Error ExecutionEngine::invokePacked(StringRef name,
                                    MutableArrayRef<void *> args) {
+  initialize();
  auto expectedFPtr = lookupPacked(name);
  if (!expectedFPtr)
    return expectedFPtr.takeError();
@ -451,3 +445,13 @@ Error ExecutionEngine::invokePacked(StringRef name,

  return Error::success();
 }
+
+void ExecutionEngine::initialize() {
+  if (isInitialized)
+    return;
+  // TODO: Allow JIT initialize for AArch64. Currently there's a bug causing a
+  // crash for AArch64 see related issue #71963.
+  if (!jit->getTargetTriple().isAArch64())
+    cantFail(jit->initialize(jit->getMainJITDylib()));
+  isInitialized = true;
+}
--- a/mlir/lib/ExecutionEngine/JitRunner.cpp
+++ b/mlir/lib/ExecutionEngine/JitRunner.cpp
@ -202,6 +202,8 @@ compileAndExecute(Options &options, Operation *module, StringRef entryPoint,

  auto engine = std::move(*expectedEngine);

+  engine->initialize();
+
  auto expectedFPtr = engine->lookupPacked(entryPoint);
  if (!expectedFPtr)
    return expectedFPtr.takeError();
--- a/mlir/python/mlir/_mlir_libs/_mlirExecutionEngine.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlirExecutionEngine.pyi
@ -19,5 +19,6 @@ class ExecutionEngine:
    def dump_to_object_file(self, file_name: str) -> None: ...
    def raw_lookup(self, func_name: str) -> int: ...
    def raw_register_runtime(self, name: str, callback: object) -> None: ...
+    def init() -> None: ...
    @property
    def _CAPIPtr(self) -> object: ...
--- a/mlir/test/CAPI/CMakeLists.txt
+++ b/mlir/test/CAPI/CMakeLists.txt
@ -30,6 +30,13 @@ if(MLIR_ENABLE_EXECUTION_ENGINE)
    MLIRCAPIConversion
    MLIRCAPIExecutionEngine
    MLIRCAPIRegisterEverything
+)
+  _add_capi_test_executable(mlir-capi-global-constructors-test
+    global_constructors.c
+  LINK_LIBS PRIVATE
+    MLIRCAPIConversion
+    MLIRCAPIExecutionEngine
+    MLIRCAPIRegisterEverything
 )
 endif()

--- a/mlir/test/CAPI/global_constructors.c
+++ b/mlir/test/CAPI/global_constructors.c
@ -0,0 +1,113 @@
+//===- global_constructors.c - Test JIT with the global constructors ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: target=aarch64{{.*}}, target=arm64{{.*}}
+/* RUN: mlir-capi-global-constructors-test 2>&1 | FileCheck %s
+ */
+/* REQUIRES: host-supports-jit
+ */
+
+#include "mlir-c/Conversion.h"
+#include "mlir-c/ExecutionEngine.h"
+#include "mlir-c/IR.h"
+#include "mlir-c/RegisterEverything.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void registerAllUpstreamDialects(MlirContext ctx) {
+  MlirDialectRegistry registry = mlirDialectRegistryCreate();
+  mlirRegisterAllDialects(registry);
+  mlirContextAppendDialectRegistry(ctx, registry);
+  mlirDialectRegistryDestroy(registry);
+}
+
+void lowerModuleToLLVM(MlirContext ctx, MlirModule module) {
+  MlirPassManager pm = mlirPassManagerCreate(ctx);
+  MlirOpPassManager opm = mlirPassManagerGetNestedUnder(
+      pm, mlirStringRefCreateFromCString("func.func"));
+  mlirPassManagerAddOwnedPass(pm, mlirCreateConversionConvertFuncToLLVMPass());
+  mlirOpPassManagerAddOwnedPass(
+      opm, mlirCreateConversionArithToLLVMConversionPass());
+  MlirLogicalResult status =
+      mlirPassManagerRunOnOp(pm, mlirModuleGetOperation(module));
+  if (mlirLogicalResultIsFailure(status)) {
+    fprintf(stderr, "Unexpected failure running pass pipeline\n");
+    exit(2);
+  }
+  mlirPassManagerDestroy(pm);
+}
+
+// Helper variable to track callback invocations
+static int initCnt = 0;
+
+// Callback function that will be called during JIT initialization
+static void initCallback(void) { initCnt += 1; }
+
+// CHECK-LABEL: Running test 'testGlobalCtorJitCallback'
+void testGlobalCtorJitCallback(void) {
+  MlirContext ctx = mlirContextCreate();
+  registerAllUpstreamDialects(ctx);
+
+  // Create module with global constructor that calls our callback
+  MlirModule module = mlirModuleCreateParse(
+      ctx, mlirStringRefCreateFromCString(
+               // clang-format off
+"module {                                                                       \n"
+"  llvm.mlir.global_ctors ctors = [@ctor], priorities = [0 : i32], data = [#llvm.zero] \n"
+"  llvm.func @ctor() {                                                          \n"
+"    func.call @init_callback() : () -> ()                                      \n"
+"    llvm.return                                                                \n"
+"  }                                                                            \n"
+"  func.func private @init_callback() attributes { llvm.emit_c_interface }      \n"
+"}                                                                              \n"
+               // clang-format on
+               ));
+
+  lowerModuleToLLVM(ctx, module);
+  mlirRegisterAllLLVMTranslations(ctx);
+
+  // Create execution engine with initialization disabled
+  MlirExecutionEngine jit = mlirExecutionEngineCreate(
+      module, /*optLevel=*/2, /*numPaths=*/0, /*sharedLibPaths=*/NULL,
+      /*enableObjectDump=*/false);
+
+  if (mlirExecutionEngineIsNull(jit)) {
+    fprintf(stderr, "Execution engine creation failed");
+    exit(2);
+  }
+
+  // Register callback symbol before initialization
+  mlirExecutionEngineRegisterSymbol(
+      jit, mlirStringRefCreateFromCString("_mlir_ciface_init_callback"),
+      (void *)(uintptr_t)initCallback);
+
+  mlirExecutionEngineInitialize(jit);
+
+  // CHECK: Init count: 1
+  printf("Init count: %d\n", initCnt);
+
+  mlirExecutionEngineDestroy(jit);
+  mlirModuleDestroy(module);
+  mlirContextDestroy(ctx);
+}
+
+int main(void) {
+
+#define _STRINGIFY(x) #x
+#define STRINGIFY(x) _STRINGIFY(x)
+#define TEST(test)                                                             \
+  printf("Running test '" STRINGIFY(test) "'\n");                              \
+  test();
+  TEST(testGlobalCtorJitCallback);
+  return 0;
+}
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@ -141,6 +141,7 @@ if(LLVM_ENABLE_PIC AND TARGET ${LLVM_NATIVE_ARCH})
    llc
    mlir_async_runtime
    mlir-capi-execution-engine-test
+    mlir-capi-global-constructors-test
    mlir_c_runner_utils
    mlir_runner_utils
    mlir_float16_utils
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@ -190,6 +190,7 @@ tools = [
    "mlir-translate",
    "mlir-lsp-server",
    "mlir-capi-execution-engine-test",
+    "mlir-capi-global-constructors-test",
    "mlir-capi-ir-test",
    "mlir-capi-irdl-test",
    "mlir-capi-llvm-test",
--- a/mlir/test/python/global_constructors.py
+++ b/mlir/test/python/global_constructors.py
@ -0,0 +1,72 @@
+# UNSUPPORTED: target=aarch64{{.*}}, target=arm64{{.*}}
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+# REQUIRES: host-supports-jit
+import gc, sys, os, tempfile
+from mlir.ir import *
+from mlir.passmanager import *
+from mlir.execution_engine import *
+from mlir.runtime import *
+
+
+# Log everything to stderr and flush so that we have a unified stream to match
+# errors/info emitted by MLIR to stderr.
+def log(*args):
+    print(*args, file=sys.stderr)
+    sys.stderr.flush()
+
+
+def run(f):
+    log("\nTEST:", f.__name__)
+    f()
+    gc.collect()
+    assert Context._get_live_count() == 0
+
+
+def lowerToLLVM(module):
+    pm = PassManager.parse(
+        "builtin.module(convert-func-to-llvm,reconcile-unrealized-casts)"
+    )
+    pm.run(module.operation)
+    return module
+
+
+# Test JIT callback in global constructor
+# CHECK-LABEL: TEST: testJITCallbackInGlobalCtor
+def testJITCallbackInGlobalCtor():
+    init_cnt = 0
+
+    @ctypes.CFUNCTYPE(None)
+    def initCallback():
+        nonlocal init_cnt
+        init_cnt += 1
+
+    with Context():
+        module = Module.parse(
+            r"""
+llvm.mlir.global_ctors ctors = [@ctor], priorities = [0 : i32], data = [#llvm.zero]
+llvm.func @ctor() {
+  func.call @init_callback() : () -> ()
+  llvm.return
+}
+func.func private @init_callback() attributes { llvm.emit_c_interface }
+        """
+        )
+
+        # Setup execution engine
+        execution_engine = ExecutionEngine(lowerToLLVM(module))
+
+        # Validate initialization hasn't run yet
+        assert init_cnt == 0
+
+        # # Register callback
+        execution_engine.register_runtime("init_callback", initCallback)
+
+        # # Initialize and verify
+        execution_engine.initialize()
+        assert init_cnt == 1
+        # # Second initialization should be no-op
+        execution_engine.initialize()
+        assert init_cnt == 1
+
+
+run(testJITCallbackInGlobalCtor)
--- a/mlir/unittests/ExecutionEngine/Invoke.cpp
+++ b/mlir/unittests/ExecutionEngine/Invoke.cpp
@ -322,4 +322,55 @@ TEST(NativeMemRefJit, MAYBE_JITCallback) {
    ASSERT_EQ(elt, coefficient * count++);
 }

+static int initCnt = 0;
+// A helper function that will be called during the JIT's initialization.
+static void initCallback() { initCnt += 1; }
+
+TEST(MLIRExecutionEngine, MAYBE_JITCallbackInGlobalCtor) {
+  auto tmBuilderOrError = llvm::orc::JITTargetMachineBuilder::detectHost();
+  ASSERT_TRUE(!!tmBuilderOrError);
+  if (tmBuilderOrError->getTargetTriple().isAArch64()) {
+    GTEST_SKIP() << "Skipping global ctor initialization test on Aarch64 "
+                    "because of bug #71963";
+    return;
+  }
+  std::string moduleStr = R"mlir(
+  llvm.mlir.global_ctors ctors = [@ctor], priorities = [0 : i32], data = [#llvm.zero]
+  llvm.func @ctor() {
+    func.call @init_callback() : () -> ()
+    llvm.return
+  }
+  func.func private @init_callback() attributes { llvm.emit_c_interface }
+  )mlir";
+
+  DialectRegistry registry;
+  registerAllDialects(registry);
+  registerBuiltinDialectTranslation(registry);
+  registerLLVMDialectTranslation(registry);
+  MLIRContext context(registry);
+  auto module = parseSourceString<ModuleOp>(moduleStr, &context);
+  ASSERT_TRUE(!!module);
+  ASSERT_TRUE(succeeded(lowerToLLVMDialect(*module)));
+  ExecutionEngineOptions jitOptions;
+  auto jitOrError = ExecutionEngine::create(*module, jitOptions);
+  ASSERT_TRUE(!!jitOrError);
+  // validate initialization is not run on construction
+  ASSERT_EQ(initCnt, 0);
+  auto jit = std::move(jitOrError.get());
+  // Define any extra symbols so they're available at initialization.
+  jit->registerSymbols([&](llvm::orc::MangleAndInterner interner) {
+    llvm::orc::SymbolMap symbolMap;
+    symbolMap[interner("_mlir_ciface_init_callback")] = {
+        llvm::orc::ExecutorAddr::fromPtr(initCallback),
+        llvm::JITSymbolFlags::Exported};
+    return symbolMap;
+  });
+  jit->initialize();
+  // validate the side effect of initialization
+  ASSERT_EQ(initCnt, 1);
+  // next initialization should be noop
+  jit->initialize();
+  ASSERT_EQ(initCnt, 1);
+}
+
 #endif // _WIN32