Check failed: (f != nullptr) is false, Cannot find function when building 2 systemlibs with CUDA

I built two systemlib models with CUDA and got the following error at runtime:

terminate called after throwing an instance of 'tvm::runtime::InternalError'
  what():  [11:58:15] ../../src/runtime/library_module.cc:85: InternalError: Check failed: ret == 0 (-1 vs. 0) : InternalError: Check failed: (f != nullptr) is false: Cannot find function tvmgen_model2_fused_add_rsqrt_multiply_kernel in the imported modules or global registry. If this involves ops from a contrib library like cuDNN, ensure TVM was built with the relevant library.
Stack trace:
  File "../../src/runtime/module.cc", line 119
...

Stack trace:
...
  [bt] (7) /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f02fd5dc083]
  [bt] (8) ./inference(+0x154de) [0x5629d37ad4de]


[1]    28786 abort (core dumped)  ./inference

Building a single model with CUDA works fine, and two systemlib models also work fine when built with llvm. The failure only occurs with two systemlib models on CUDA. Please find the code snippets below.

Python

import os

import tvm
from tvm import relay
from tvm import runtime as tvm_runtime
from tvm.relay.backend import Runtime


def build_module():
    dshape = (1, 3, 224, 224)
    from mxnet.gluon.model_zoo.vision import get_model

    block = get_model("mobilenet0.25", pretrained=True)
    shape_dict = {"data": dshape}
    mod, params = relay.frontend.from_mxnet(block, shape_dict)
    func = mod["main"]
    func = relay.Function(
        func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs
    )

    models = [
        ("model1", Runtime("cpp", {"system-lib": True})),
        ("model2", Runtime("cpp", {"system-lib": True})),
    ]

    for name, runtime in models:
        with tvm.transform.PassContext(opt_level=3):
            # use a separate name for the lowered params so that the second
            # build still receives the original pretrained params
            graph, lib, lowered_params = relay.build(
                func, "cuda", runtime=runtime, params=params, mod_name=name
            )

        build_dir = os.path.abspath("build")
        if not os.path.isdir(build_dir):
            os.makedirs(build_dir)
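        # export_library writes a .tar of objects; it is unpacked into
        # build/<name>/ (lib0.o, devc.o) before running the Makefile below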
        lib_file_name = os.path.join(build_dir, f"{name}.tar")
        lib.export_library(lib_file_name)
        with open(os.path.join(build_dir, f"{name}.json"), "w") as f_graph_json:
            f_graph_json.write(graph)
        with open(os.path.join(build_dir, f"{name}.params"), "wb") as f_params:
            f_params.write(tvm_runtime.save_param_dict(lowered_params))

C++

#include <dlpack/dlpack.h>
#include <tvm/runtime/logging.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <fstream>
#include <sstream>
#include <string>

std::string json_data(const std::string& path);  // small file-reading helper, shown below

void RunModel1() {
  LOG(INFO) << "Running graph executor1...";

  std::string json1 = json_data("./build/model1.json");

  tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("runtime.SystemLib"))();
  // create the graph executor module
  int dev_type = kDLCUDA;
  int dev_id = 0;
  tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_executor.create"))(
      json1, mod_syslib, dev_type, dev_id);

  tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input");
  tvm::runtime::PackedFunc get_output = mod.GetFunction("get_output");
  tvm::runtime::PackedFunc run = mod.GetFunction("run");

  // Use the C++ API
  DLDevice dev{kDLCUDA, 0};
  tvm::runtime::NDArray x =
      tvm::runtime::NDArray::Empty({1, 3, 224, 224}, DLDataType{kDLFloat, 32, 1}, dev);

  // set the right input
  set_input("data", x);
  // run the code
  run();
}

void RunModel2() {
  LOG(INFO) << "Running graph executor2...";

  std::string json2 = json_data("./build/model2.json");

  tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("runtime.SystemLib"))();
  // create the graph executor module
  int dev_type = kDLCUDA;
  int dev_id = 0;
  tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_executor.create"))(
      json2, mod_syslib, dev_type, dev_id);

  tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input");
  tvm::runtime::PackedFunc get_output = mod.GetFunction("get_output");
  tvm::runtime::PackedFunc run = mod.GetFunction("run");

  // Use the C++ API
  DLDevice dev{kDLCUDA, 0};
  tvm::runtime::NDArray x =
      tvm::runtime::NDArray::Empty({1, 3, 224, 224}, DLDataType{kDLFloat, 32, 1}, dev);

  // set the right input
  set_input("data", x);
  // run the code
  run();
  // get the output
  // get_output(0, y);
}

int main(void) {
  RunModel1();
  RunModel2();
}
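
Here json_data is just a small helper (defined before RunModel1 in the actual file) that reads the graph JSON from disk, roughly:

// Read the whole graph JSON file into a std::string.
std::string json_data(const std::string& path) {
  std::ifstream in(path);
  ICHECK(in.is_open()) << "Cannot open " << path;
  std::stringstream ss;
  ss << in.rdbuf();
  return ss.str();
}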

Makefile

TVM_ROOT=$(shell cd ../..; pwd)
DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
CUDA_HOME=/usr/local/cuda

PKG_CFLAGS = -std=c++17 -O2 -fPIC\
	-I${TVM_ROOT}/include\
	-I${DMLC_CORE}/include\
	-I${TVM_ROOT}/3rdparty/dlpack/include\
	-I${CUDA_HOME}/include\
	-DDMLC_USE_LOGGING_LIBRARY=\<tvm/runtime/logging.h\>

PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -pthread

tvm_runtime_pack.o: tvm_runtime_pack.cc
	$(CXX) -c $(PKG_CFLAGS) -o $@  $^

deploy: tvm_runtime_pack.o
	$(CXX) $(PKG_CFLAGS) -Wl,--allow-multiple-definition -o inference \
		inference.cc tvm_runtime_pack.o \
		build/model1/lib0.o build/model1/devc.o \
		build/model2/lib0.o build/model2/devc.o \
		$(PKG_LDFLAGS) -L${CUDA_HOME}/lib64 -lcuda -lcudart

Note that

  • I already link lib0.o and devc.o of both models into the binary.
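  • Linking needs -Wl,--allow-multiple-definition because the two devc.o files define the same symbols (e.g. the embedded device blob __tvm_dev_mblob). My suspicion is that the linker then keeps only model1's blob, which would explain why model2's kernel goes missing.

To check which symbols actually survive linking, here is a minimal probe sketch (the kernel name is copied from the error message; query_imports=true lets GetFunction also search the modules loaded from the device blob):

  // Probe whether model2's kernel is reachable; a null PackedFunc means
  // the symbol is in neither the system lib nor its imported modules.
  tvm::runtime::Module syslib = (*tvm::runtime::Registry::Get("runtime.SystemLib"))();
  tvm::runtime::PackedFunc f = syslib.GetFunction(
      "tvmgen_model2_fused_add_rsqrt_multiply_kernel", /*query_imports=*/true);
  LOG(INFO) << "model2 kernel is " << (f == nullptr ? "missing" : "present");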