A problem while deploying an algorithm on a Jetson Nano using RPC

I have a problem while deploying an algorithm on a Jetson Nano using RPC + AutoTVM. I have tuned the algorithm successfully, as follows:

import logging
import sys
import os
import numpy as np
import tvm
from tvm import autotvm
from tvm import relay
import tvm.relay.testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime
import topi

@autotvm.template
def matmul_cuda(N, L, M, dtype):
    """AutoTVM template for a dense (N, L) x (L, M) matmul on CUDA.

    Declares the compute via topi.matmul, exposes two split knobs
    ("tile_y" / "tile_x") to the tuner, and binds the resulting outer
    and inner axes to CUDA block and thread indices.

    Returns the schedule and the argument buffers [A, B, C].
    """
    lhs = tvm.placeholder((N, L), name='A', dtype=dtype)
    rhs = tvm.placeholder((L, M), name='B', dtype=dtype)

    out = topi.matmul(lhs, rhs)
    sched = tvm.create_schedule(out.op)

    # The two spatial axes of the result tensor.
    row, col = sched[out].op.axis

    ##### define space begin #####
    cfg = autotvm.get_config()
    # Each axis gets a single two-way split; the tuner picks the factors.
    cfg.define_split("tile_y", row, num_outputs=2)
    cfg.define_split("tile_x", col, num_outputs=2)
    ##### define space end #####

    # Apply the split chosen by the current configuration.
    row_outer, row_inner = cfg["tile_y"].apply(sched, out, row)
    col_outer, col_inner = cfg["tile_x"].apply(sched, out, col)

    # Outer axes map to the CUDA grid, inner axes to threads in a block.
    sched[out].bind(row_outer, tvm.thread_axis("blockIdx.y"))
    sched[out].bind(row_inner, tvm.thread_axis("threadIdx.y"))
    sched[out].bind(col_outer, tvm.thread_axis("blockIdx.x"))
    sched[out].bind(col_inner, tvm.thread_axis("threadIdx.x"))

    sched[out].reorder(row_outer, col_outer, row_inner, col_inner)

    return sched, [lhs, rhs, out]

# ----------------------------------------------------------------------
# Tuning driver: tune matmul_cuda on a remote Jetson Nano through the
# RPC tracker, then build the best schedule and export a shared library
# that can be uploaded to and run on the device.
# ----------------------------------------------------------------------

# The Jetson Nano's GPU is Maxwell, compute capability 5.3.  When
# cross-compiling CUDA on the host, TVM must be told the remote GPU
# architecture explicitly; otherwise NVCC emits PTX targeting the
# host's GPU, and loading the module on the Nano fails with
# CUDA_ERROR_INVALID_PTX (the exact error reported above).
from tvm.autotvm.measure.measure_methods import set_cuda_target_arch
set_cuda_target_arch('sm_53')

tgt_cuda = 'cuda'
tgt_host = "llvm -target=aarch64-linux-gnu"  # cross-compile host code for the Nano

N, L, M = 20, 10, 15
task = autotvm.task.create(matmul_cuda, args=(N, L, M, 'float32'),
                           target=tgt_cuda, target_host=tgt_host)

# Mirror the tuner's progress to stdout for easier debugging.
logging.getLogger('autotvm').setLevel(logging.DEBUG)
logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    # 'nano' is the device key registered with the RPC tracker at 0.0.0.0:9190.
    runner=autotvm.RPCRunner('nano', '0.0.0.0', 9190,
                             repeat=3, min_repeat_ms=500, timeout=4),
)

# Make sure the output directory exists before the tuner logs into it.
os.makedirs('./tvm_output', exist_ok=True)

tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=50,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('./tvm_output/matmul_cuda_nano.log')])

# Apply the best configuration found during tuning and build the module.
with autotvm.apply_history_best('./tvm_output/matmul_cuda_nano.log'):
    with tvm.target.create("cuda"):
        s, arg_bufs = matmul_cuda(N, L, M, 'float32')
        func_cuda_nano = tvm.build(s, arg_bufs, target=tgt_cuda, target_host=tgt_host)

# Cross-compile the host-side wrapper with the aarch64 toolchain so the
# resulting .so is loadable on the Nano.
func_cuda_nano.export_library("./tvm_output/myfunc_nano_cr_cuda_autotvm.so",
                              cc='/usr/bin/aarch64-linux-gnu-gcc')
Successful tune:

No: 23 GFLOPS: 0.23/0.23 result: MeasureResult(costs=(2.6544782446752703e-05,), error_no=0, all_cost=27.74042797088623, timestamp=1566171425.8727539) [(‘tile_y’, [20, 1]), (‘tile_x’, [3, 5])],None,12

Bad Tune: No: 2 GFLOPS: 0.00/0.23 result: MeasureResult(costs=(RuntimeError(‘Traceback (most recent call last):\n [bt] (3) /home/lukuan/lk_git/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7fc170ff3691]\n [bt] (2) /home/lukuan/lk_git/tvm/build/libtvm.so(std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), tvm::runtime::RPCModuleNode::WrapRemote(void*)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)+0x3b) [0x7fc17104befb]\n [bt] (1) /home/lukuan/lk_git/tvm/build/libtvm.so(tvm::runtime::RPCSession::CallFunc(void*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*, tvm::runtime::PackedFunc const*)+0x154) [0x7fc171030924]\n [bt] (0) /home/lukuan/lk_git/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x7fc1708617f2]\n File “/home/lukuan/lk_git/tvm/src/runtime/rpc/rpc_session.cc”, line 962\nTVMError: Check failed: code == RPCCode: :kReturn: code=4’,),), error_no=4, all_cost=9.11729645729065, timestamp=1566171341.5919862) [(‘tile_y’, [1, 20]), (‘tile_x’, [5, 3])],None,11

But when it comes to uploading the tuned and compiled file to the remote device and running the function:

func(a, b, c), it raises the following error:

TVMError: Except caught from RPC call: [08:04:21] /home/lly2014/lk_git/tvm/src/runtime/module_util.cc:73: Check failed: ret == 0 (-1 vs. 0) : CUDAError: cuModuleLoadData(&(module_[device_id]), data_.c_str()) failed with error: CUDA_ERROR_INVALID_PTX

So, I am confused by this. Has anyone managed to complete a whole RPC + AutoTVM workflow end to end? Thanks.