[Heterogeneous Execution] Segmentation Fault while running on CPU and GPU

I’m new to TVM. I tried to do heterogeneous execution on CPU and GPU, following some examples in /tvm/tests/python/relay/test_pass_annotation.py. I built the annotated module and ran it on CPU and GPU, but some strange errors occurred.
Here is my code. My TVM version is 0.8.

import tvm
from tvm import relay
from tvm.relay import transform
from tvm.contrib import graph_runtime

def run_opt_pass(expr, passes):
    passes = passes if isinstance(passes, list) else [passes]
    mod = tvm.IRModule.from_expr(expr)
    seq = tvm.transform.Sequential(passes)
    with tvm.transform.PassContext(opt_level=3):
        mod = seq(mod)
    return mod["main"]

def test_annotate_all():
    ctx1 = tvm.context(1) # cpu
    ctx2 = tvm.context(2) # gpu
    x = relay.var("x", shape=(3,))
    y = relay.var("y", shape=(3,))
    z = relay.var("z", shape=(3,))
    target = {'cpu': 'llvm', 'cuda': 'cuda'}
    
    def annotated():
        add = relay.add(x, y)
        _add = relay.annotation.on_device(add, ctx1)
        sub = relay.subtract(_add, z)
        _sub = relay.annotation.on_device(sub, ctx2)
        func = relay.Function([x, y, z], _sub)
        func = run_opt_pass(func, transform.RewriteAnnotatedOps(ctx1.device_type))
        return func
    
    def expected():
        add = relay.add(x, y)
        sub = relay.subtract(add, z)
        func = relay.Function([x, y, z], sub)
        return func
    
    annotated_func = annotated()
    mod = tvm.IRModule.from_expr(annotated_func)

    # note: the optimized module returned here is not actually passed to relay.build below
    amod, _ = relay.optimize(mod, target=target)
    params = {}

    with relay.build_config(opt_level=1):
        graph, lib, params = relay.build(mod, target, params=params)
    
    contexts = [tvm.cpu(0), tvm.context("cuda")]
    # graph_runtime.create accepts a list of contexts for heterogeneous execution
    rt_mod = graph_runtime.create(graph, lib, contexts)

    rt_mod.set_input("x", [1, 2, 3])
    rt_mod.set_input("y", [4, 5, 6])
    rt_mod.set_input("z", [2, 2, 2])
    rt_mod.run()
    result = rt_mod.get_output(0).asnumpy()
    print(result)
    
test_annotate_all()

If I place the two ops on the same device (both on CPU or both on GPU), the code produces the correct result [3. 5. 7.], e.g. with both annotations pinned to the same context:
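
_add = relay.annotation.on_device(add, ctx1)
_sub = relay.annotation.on_device(sub, ctx1)
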
But if I put add on the CPU and subtract on the GPU as follows, a segmentation fault happens.

_add = relay.annotation.on_device(add, ctx1)
_sub = relay.annotation.on_device(sub, ctx2)


And if I put add on the GPU and subtract on the CPU, i.e. with the annotations reversed as follows, the program aborts.
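
_add = relay.annotation.on_device(add, ctx2)
_sub = relay.annotation.on_device(sub, ctx1)

Here is the traceback: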

Traceback (most recent call last):
  File "annotated.py", line 75, in <module>
    test_annotate_all()
  File "annotated.py", line 68, in test_annotate_all
    mod.run()
  File "/opt/tvm/python/tvm/contrib/graph_runtime.py", line 207, in run
    self._run()
  File "/opt/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 237, in __call__
    raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
  [bt] (6) /usr/lib64/libtvm.so(TVMFuncCall+0x48) [0x7f4030393298]
  [bt] (5) /usr/lib64/libtvm.so(tvm::runtime::GraphRuntime::Run()+0x37) [0x7f403041c527]
  [bt] (4) /usr/lib64/libtvm.so(+0x15b4ef5) [0x7f403041def5]
  [bt] (3) /usr/lib64/libtvm.so(TVMArrayCopyFromTo+0xa) [0x7f40303b2e5a]
  [bt] (2) /usr/lib64/libtvm.so(tvm::runtime::NDArray::CopyFromTo(DLTensor const*, DLTensor*, void*)+0x1f5) [0x7f40303b2d35]
  [bt] (1) /usr/lib64/libtvm.so(tvm::runtime::CUDADeviceAPI::CopyDataFromTo(void const*, unsigned long, void*, unsigned long, unsigned long, DLContext, DLContext, DLDataType, void*)+0xa3) [0x7f4030440e73]
  [bt] (0) /usr/lib64/libtvm.so(+0x15d6ac5) [0x7f403043fac5]
  File "/sources/tvm/src/runtime/cuda/cuda_device_api.cc", line 226
  File "/sources/tvm/src/runtime/graph/graph_runtime.cc", line 412
CUDA: Check failed: ret == 0 (-1 vs. 0) : Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading: an illegal memory access was encountered
terminate called after throwing an instance of 'dmlc::Error'
  what():  [22:20:56] /sources/tvm/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_ILLEGAL_ADDRESS
Stack trace:
  [bt] (0) /usr/lib64/libtvm.so(+0x15db32a) [0x7f403044432a]
  [bt] (1) /usr/lib64/libtvm.so(tvm::runtime::CUDAModuleNode::~CUDAModuleNode()+0xd1) [0x7f4030446cd1]
  [bt] (2) /usr/lib64/libtvm.so(tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*)+0x19) [0x7f4030446e79]
  [bt] (3) /usr/lib64/libtvm.so(tvm::codegen::LLVMModuleNode::~LLVMModuleNode()+0x223) [0x7f4030374ef3]
  [bt] (4) /usr/lib64/libtvm.so(tvm::runtime::SimpleObjAllocator::Handler<tvm::codegen::LLVMModuleNode>::Deleter_(tvm::runtime::Object*)+0x19) [0x7f4030374f69]
  [bt] (5) /usr/lib64/libtvm.so(TVMObjectFree+0x20) [0x7f40303b5dd0]
  [bt] (6) /usr/lib64/libffi.so.6(ffi_call_unix64+0x4c) [0x7f420eb72dcc]
  [bt] (7) /usr/lib64/libffi.so.6(ffi_call+0x1f5) [0x7f420eb726f5]
  [bt] (8) /usr/lib64/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2a0) [0x7f420ed85600]


Aborted (core dumped)


Is there something wrong with my code, or is this a bug?

We have met a similar problem … @zhiics