Hi, I am trying to further optimize the model using the auto-tuning feature for the Vulkan backend on an Android device, but the auto-tuning script gives me the error below:
Task: Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 192, 192), 'float32'), ('TENSOR', (24, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 192, 192), 'float32'), ('TENSOR', (24, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'))
[Task 1/33] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (50/50) | 44.78 s
WARNING:root:Could not find any valid schedule for task Task(func_name=conv2d_nchw.cuda, args=(('TENSOR', (1, 32, 192, 192), 'float32'), ('TENSOR', (24, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32'), kwargs={}, workload=('conv2d_nchw.cuda', ('TENSOR', (1, 32, 192, 192), 'float32'), ('TENSOR', (24, 32, 1, 1), 'float32'), (1, 1), (0, 0, 0, 0), (1, 1), 'float32')). A file containing the errors has been written to /tmp/tvm_tuning_errors_57uwldt1.log. Done.
The error log contains the following errors:
TVMError: Let statement of variable conv2d_nchw is missing a type annotation, or type annotation is not a pointer to primitive
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel.
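In case it is useful for debugging, here is a minimal sketch (assuming the same mod, params, test_target, and target variables as in the script that follows) that only lists the extracted tasks, to confirm that the conv2d_nchw.cuda template is what autotvm selects for the Vulkan target:

tasks = autotvm.task.extract_from_program(
    mod, target=test_target, target_host=target, params=params
)
for t in tasks:
    # Print each task's schedule template name and config-space size.
    print(t.name, len(t.config_space))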
Script:
import os

import numpy as np
import onnx

import tvm
from tvm import autotvm, relay, rpc
from tvm.contrib import ndk
from tvm.contrib import graph_executor as runtime  # graph_runtime on older TVM releases

is_tuning = True
tune_log = "auto_vulkan.log"

model_path = "dummy_model.onnx"
onnx_model = onnx.load(model_path)
print("quantised model is loaded")

test_target = "vulkan"
arch = "arm64"
target = tvm.target.Target("llvm -mtriple=arm64-linux-android")

input_name = "data"
shape_dict = {input_name: (1, 3, 384, 384)}
dtype_dict = {input_name: "float32"}
print("build the ir")

mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype_dict)
run_on_device = True  # False for x86, True for arm64
target_host = None

# Device target is Vulkan; the host side is the llvm arm64-android target above.
target = tvm.target.Target(test_target, host=target)
print(target)

rpc_tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
rpc_tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
key = "android"

if is_tuning:
    # Auto Tuning Stage 1: Extract tunable tasks
    tasks = autotvm.task.extract_from_program(
        mod, target=test_target, target_host=target, params=params
    )
    # Auto Tuning Stage 2: Define tuning configuration
    tmp_log_file = tune_log + ".tmp"
    measure_option = autotvm.measure_option(
        builder=autotvm.LocalBuilder(
            build_func=ndk.create_shared, timeout=50
        ),  # Build the test kernel locally
        runner=autotvm.RPCRunner(  # The runner executes on the remote device.
            key,  # RPC key
            host=rpc_tracker_host,  # Tracker host
            port=int(rpc_tracker_port),  # Tracker port
            number=1,  # Number of runs before averaging
            timeout=5000,  # RPC timeout
        ),
    )
    n_trial = 50  # Number of tuning trials before choosing the best kernel config
    early_stopping = False  # Can be enabled to stop tuning when results stop improving.

    from tvm.autotvm.tuner import GATuner, GridSearchTuner, RandomTuner, XGBTuner

    # Auto Tuning Stage 3: Run a tuner over each extracted task.
    for i, tsk in enumerate(reversed(tasks[:3])):  # only the first 3 tasks, in reverse order
        print("Task:", tsk)
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        # choose tuner
        tuner = "xgb"
        # create tuner
        if tuner == "xgb":
            tuner_obj = XGBTuner(tsk, loss_type="reg")
        elif tuner == "xgb_knob":
            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob")
        elif tuner == "xgb_itervar":
            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar")
        elif tuner == "xgb_curve":
            tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve")
        elif tuner == "xgb_rank":
            tuner_obj = XGBTuner(tsk, loss_type="rank")
        elif tuner == "xgb_rank_knob":
            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
        elif tuner == "xgb_rank_itervar":
            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar")
        elif tuner == "xgb_rank_curve":
            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve")
        elif tuner == "xgb_rank_binary":
            tuner_obj = XGBTuner(tsk, loss_type="rank-binary")
        elif tuner == "xgb_rank_binary_knob":
            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob")
        elif tuner == "xgb_rank_binary_itervar":
            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar")
        elif tuner == "xgb_rank_binary_curve":
            tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve")
        elif tuner == "ga":
            tuner_obj = GATuner(tsk, pop_size=50)
        elif tuner == "random":
            tuner_obj = RandomTuner(tsk)
        elif tuner == "gridsearch":
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        tsk_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(
            n_trial=tsk_trial,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
                autotvm.callback.log_to_file(tmp_log_file),
            ],
        )

    # Auto Tuning Stage 4: Pick the best performing configurations from the overall log.
    autotvm.record.pick_best(tmp_log_file, tune_log)
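    # Optional sanity check, not in the original script: inspect what
    # pick_best kept. autotvm.record.load_from_file yields
    # (MeasureInput, MeasureResult) pairs; error_no == 0 marks a valid run.
    for inp, res in autotvm.record.load_from_file(tune_log):
        print(inp.task.name, res.error_no, res.costs)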
if os.path.exists(tune_log) and is_tuning:
    print("with auto tune")
    with autotvm.apply_history_best(tune_log):
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target=target, params=params)
else:
    print("without auto tune")
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)

lib_fname = "dummy_model.tvm.so"
if run_on_device:
    lib_fname = "dummy_model.tvm.so"

print(ndk)
print(ndk.create_shared)
fcompile = ndk.create_shared if run_on_device else None
lib.export_library(lib_fname, fcompile)
print(lib_fname)
if not run_on_device:
    remote = rpc.LocalSession()
else:
    tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
    tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
    key = "android"
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=600)

ctx = remote.vulkan(0)
print(ctx)

remote.upload(lib_fname)
rlib = remote.load_module(lib_fname)
module = runtime.GraphModule(rlib["default"](ctx))
module.run()
ftime = module.module.time_evaluator("run", ctx, number=1, repeat=10)
prof_res = np.array(ftime().results) * 1000
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
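As a side note for anyone reusing this: module.run() above executes on whatever default input the graph has. A minimal sketch of feeding a dummy input first, assuming input_name and the (1, 3, 384, 384) shape from shape_dict above (the random tensor is only a placeholder):

dummy = np.random.uniform(size=(1, 3, 384, 384)).astype("float32")
module.set_input(input_name, tvm.nd.array(dummy, ctx))
module.run()
out = module.get_output(0).numpy()  # .asnumpy() on older TVM releases
print(out.shape)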