Auto Tuning with Android, Vulkan is not working

Hi, I am trying to further optimize my model using the auto-tuning feature on an Android device with the Vulkan backend. However, the auto-tuning script gives me the following error:

Task: Task(func_name=conv2d_nchw.cuda, args=((‘TENSOR’, (1, 32, 192, 192), ‘float32’), (‘TENSOR’, (24, 32, 1, 1), ‘float32’), (1, 1), (0, 0, 0, 0), (1, 1), ‘float32’), kwargs={}, workload=(‘conv2d_nchw.cuda’, (‘TENSOR’, (1, 32, 192, 192), ‘float32’), (‘TENSOR’, (24, 32, 1, 1), ‘float32’), (1, 1), (0, 0, 0, 0), (1, 1), ‘float32’)) [Task 1/33] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (50/50) | 44.78 sWARNING:root:Could not find any valid schedule for task Task(func_name=conv2d_nchw.cuda, args=((‘TENSOR’, (1, 32, 192, 192), ‘float32’), (‘TENSOR’, (24, 32, 1, 1), ‘float32’), (1, 1), (0, 0, 0, 0), (1, 1), ‘float32’), kwargs={}, workload=(‘conv2d_nchw.cuda’, (‘TENSOR’, (1, 32, 192, 192), ‘float32’), (‘TENSOR’, (24, 32, 1, 1), ‘float32’), (1, 1), (0, 0, 0, 0), (1, 1), ‘float32’)). A file containing the errors has been written to /tmp/tvm_tuning_errors_57uwldt1.log. Done.

The following errors are present in the error log:

TVMError: Let statement of variable conv2d_nchw is missing a type annotation, or type annotation is not a pointer to primitive

raise InstantiationError(“Skipped because of invalid gpu kernel”) tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel.

Script:

# --- Configuration -----------------------------------------------------------
is_tuning = True
tune_log = "auto_vulkan.log"

# Load the (quantised) ONNX model from disk.
model_path = "dummy_model.onnx"
onnx_model = onnx.load(model_path)
print(" quantised model is loaded")

# Device target: Vulkan kernels, hosted by an arm64 Android LLVM host target.
test_target = "vulkan"
arch = "arm64"
target = tvm.target.Target("llvm -mtriple=arm64-linux-android")

# Input description for the Relay importer.
input_name = "data"
shape_dict = {input_name: (1, 3, 384, 384)}
dtype_dict = {input_name: "float32"}
print(" build the ir ")

mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype_dict)
run_on_device = True  # False for x86, True for arm64
target_host = None

# Combined target: Vulkan device code with the Android LLVM target as host.
target = tvm.target.Target(test_target, host=target)
print(target)

# RPC tracker coordinates for reaching the remote Android device.
rpc_tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
rpc_tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
key = "android"

if is_tuning:
    # Auto Tuning Stage 1: extract tunable tasks from the Relay module.
    tasks = autotvm.task.extract_from_program(
        mod, target=test_target, target_host=target, params=params
    )

# Auto Tuning Stage 2: define how candidate kernels are built and measured.
tmp_log_file = tune_log + ".tmp"

# Candidate kernels are cross-compiled locally with the Android NDK toolchain...
local_builder = autotvm.LocalBuilder(build_func=ndk.create_shared, timeout=50)

# ...and benchmarked on a remote device leased from the RPC tracker.
remote_runner = autotvm.RPCRunner(
    key,                        # RPC key identifying the device class
    host=rpc_tracker_host,      # tracker host
    port=int(rpc_tracker_port), # tracker port
    number=1,                   # runs averaged per measurement
    timeout=5000,               # RPC timeout
)

measure_option = autotvm.measure_option(
    builder=local_builder, runner=remote_runner
)

n_trial = 50            # configurations tried per task before picking the best
early_stopping = False  # set to stop tuning early once the loss stops improving

from tvm.autotvm.tuner import XGBTuner

# Auto Tuning Stage 3: tune each extracted task.
# Fix: GATuner/RandomTuner/GridSearchTuner were referenced without being
# imported (NameError if ever selected); import them explicitly here.
from tvm.autotvm.tuner import GATuner, GridSearchTuner, RandomTuner, XGBTuner

# XGBoost tuner variants, keyed by name -> XGBTuner keyword arguments.
# Replaces the original 30-line if/elif chain with a dispatch table.
_XGB_VARIANTS = {
    "xgb": {"loss_type": "reg"},
    "xgb_knob": {"loss_type": "reg", "feature_type": "knob"},
    "xgb_itervar": {"loss_type": "reg", "feature_type": "itervar"},
    "xgb_curve": {"loss_type": "reg", "feature_type": "curve"},
    "xgb_rank": {"loss_type": "rank"},
    "xgb_rank_knob": {"loss_type": "rank", "feature_type": "knob"},
    "xgb_rank_itervar": {"loss_type": "rank", "feature_type": "itervar"},
    "xgb_rank_curve": {"loss_type": "rank", "feature_type": "curve"},
    "xgb_rank_binary": {"loss_type": "rank-binary"},
    "xgb_rank_binary_knob": {"loss_type": "rank-binary", "feature_type": "knob"},
    "xgb_rank_binary_itervar": {"loss_type": "rank-binary", "feature_type": "itervar"},
    "xgb_rank_binary_curve": {"loss_type": "rank-binary", "feature_type": "curve"},
}


def _create_tuner(task, tuner):
    """Return the tuner object for *task* selected by the *tuner* name.

    Raises ValueError for an unknown name, matching the original chain.
    """
    if tuner in _XGB_VARIANTS:
        return XGBTuner(task, **_XGB_VARIANTS[tuner])
    if tuner == "ga":
        return GATuner(task, pop_size=50)
    if tuner == "random":
        return RandomTuner(task)
    if tuner == "gridsearch":
        return GridSearchTuner(task)
    raise ValueError("Invalid tuner: " + tuner)


# NOTE(review): only the first three tasks are tuned (tasks[:3]) while the
# progress prefix still reports len(tasks) -- confirm the slice is intentional.
for i, tsk in enumerate(reversed(tasks[:3])):
    print("Task:", tsk)
    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

    # choose tuner
    tuner = "xgb"

    # create tuner
    tuner_obj = _create_tuner(tsk, tuner)

    # Never request more trials than the task's config space contains.
    tsk_trial = min(n_trial, len(tsk.config_space))
    tuner_obj.tune(
        n_trial=tsk_trial,
        early_stopping=early_stopping,
        measure_option=measure_option,
        callbacks=[
            autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
            autotvm.callback.log_to_file(tmp_log_file),
        ],
    )
# Auto Tuning Stage 4: Pick the best performing configurations from the overall log.
# Reads every record accumulated in tmp_log_file and writes only the
# best-performing entry per workload to tune_log.
autotvm.record.pick_best(tmp_log_file, tune_log)

# Build the Relay module, applying the tuned schedules when a tuning log exists.
if os.path.exists(tune_log) and is_tuning:
    print("with auto tune")
    with autotvm.apply_history_best(tune_log):
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=target, params=params)
else:
    print("without auto tune")
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)

# Export the compiled module as a shared library, cross-compiling with the
# Android NDK toolchain when targeting the device.
# NOTE(review): the run_on_device branch assigns the same filename, so it is
# redundant -- kept for fidelity with the original script.
lib_fname = "dummy_model.tvm.so"
if run_on_device:
    lib_fname = "dummy_model.tvm.so"

print(ndk)
print(ndk.create_shared)
fcompile = ndk.create_shared if run_on_device else None
lib.export_library(lib_fname, fcompile)

print(lib_fname)

# Obtain an RPC session: local for x86, or lease the Android device from the
# tracker when running on-device.
if not run_on_device:
    remote = rpc.LocalSession()
else:
    tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
    tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
    key = "android"
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=600)

ctx = remote.vulkan(0)
print(ctx)

# Ship the shared library to the device and load it there.
remote.upload(lib_fname)
rlib = remote.load_module(lib_fname)

# NOTE(review): the forum formatting dropped the brackets here; the standard
# TVM idiom is rlib["default"](ctx) -- confirm against the original script.
module = runtime.GraphModule(rlib["default"](ctx))

module.run()

# Time 10 repetitions of the whole graph and report mean/stddev in ms.
ftime = module.module.time_evaluator("run", ctx, number=1, repeat=10)
prof_res = np.array(ftime().results) * 1000
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))

@merrymercy , @eqy , Please support if possible. Thank you.