[Error] Skipped because of invalid gpu kernel on NV GPU

Hi,

When I use TVM to tune a Conv-based model on an NV GPU, I always see the "Skipped because of invalid gpu kernel" error message: the generated GPU kernel is always invalid. I wonder if my usage is wrong. Here are the error message and the code. Could someone help take a look?

Error message:

[tt] autotvm i =  1920 . k =  37
[tt] autotvm tuner tune, flops =  0 . best_flops =  0
DEBUG:autotvm:No: 1958  GFLOPS: 0.00/0.00   result: Traceback (most recent call last):
  File "/home/dimitrov/tvm/python/tvm/autotvm/measure/measure_methods.py", line 567, in __call__
    func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
  File "/home/dimitrov/tvm/python/tvm/autotvm/measure/measure_methods.py", line 519, in _build_func_common
    func = build(s, args, target=target, runtime=runtime)
  File "/home/dimitrov/tvm/python/tvm/driver/build_module.py", line 235, in build
    input_mod = lower(inputs, args, name=name, binds=binds)
  File "/home/pt-gpu/dimitrov/tvm/python/tvm/driver/build_module.py", line 142, in lower
    return ffi.lower_schedule(inp, args, name, binds, simple_mode)
  File "/home/dimitrov/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 239, in __call__
    raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
  10: TVMFuncCall
  9: _ZN3tvm7runtime13PackedFun
  8: tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, tvm::runtime::String const&, tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer, void, void> const&, bool)>::AssignTypedLambda<tvm::{lambda(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, tvm::runtime::String const&, tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer, void, void> const&, bool)#5}>(tvm::{lambda(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, tvm::runtime::String const&, tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer, void, void> const&, bool)#5}, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const, tvm::runtime::TVMRetValue) const
  7: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
  6: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
  5: tvm::transform::Pass::operator()(tvm::IRModule) const
  4: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  3: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  2: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  1: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  0: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<TVMFuncCreateFromCFunc::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#2}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  File "/home/dimitrov/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 83, in cfun
    rv = local_pyfunc(*pyargs)
  File "/home/dimitrov/tvm/python/tvm/autotvm/measure/measure_methods.py", line 845, in verify_pass
    raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel

tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel

Here is the Code:

# Build candidate kernels locally with the default build function.
builder = autotvm.LocalBuilder(build_func="default")

# Measure each candidate locally on the target device.
# NOTE: enable_cpu_cache_flush is a CPU-only option — per the TVM docs it
# flushes the CPU cache between runs and forces the effective `number` to 1.
# For GPU (CUDA) tuning it gives no benefit and silently overrides the
# `number=number` setting above, so keep it disabled here.
runner = autotvm.LocalRunner(
    number=number,          # runs averaged per measurement
    repeat=repeat,          # measurements per candidate
    timeout=10,             # seconds before a measurement is abandoned
    min_repeat_ms=100,      # keep repeating until at least 100 ms elapsed
    enable_cpu_cache_flush=False,
)

# Tuning configuration consumed by the per-task loop below.
tuning_option = {
    "tuner": "xgb",                 # XGBoost cost-model tuner
    "trials": 2000,                 # upper bound on configs tried per task
    "early_stopping": None,         # never stop early
    "measure_option": autotvm.measure_option(builder=builder, runner=runner),
    "tuning_records": tune_record_file,  # log file for the best configs
}

# Extract the tunable (Conv-based) tasks from the Relay program.
tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

for i, task in enumerate(tasks):
    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
    tuner_obj = XGBTuner(task, loss_type="rank")
    # The tuner never runs more trials than the task's config space holds.
    # Use the same capped count for the progress bar, otherwise the bar's
    # total is wrong whenever len(task.config_space) < trials.
    n_trial = min(tuning_option["trials"], len(task.config_space))
    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=tuning_option["early_stopping"],
        measure_option=tuning_option["measure_option"],
        callbacks=[
            autotvm.callback.progress_bar(n_trial, prefix=prefix),
            autotvm.callback.log_to_file(tuning_option["tuning_records"]),
        ],
    )

# Compile the model with the best tuned configs applied.
# Use the same `target` for relay.build and tvm.device — the original
# hardcoded "cuda" here while creating the device from `target`, which
# breaks if `target` carries extra options (e.g. "cuda -arch=sm_80").
with autotvm.apply_history_best(tuning_option["tuning_records"]):
    with tvm.transform.PassContext(opt_level=3, config={}):
        lib = relay.build(mod, target=target, params=params)

dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))

Thank you.