(The missing example of) How to use TVM's MetaSchedule to optimize your ONNX model, save it as .tar, and run it with Python

(Running this code in a Jupyter notebook is recommended, so you won’t lose the tuning records if, for example, your code crashes midway.)

  1. Imports
import tvm
from tvm import relay
from tvm.driver.tvmc.transform import apply_graph_transforms
import onnx
import tvm.testing
import tvm.topi.testing
from tvm import meta_schedule as ms
from tvm.meta_schedule.runner import (
    EvaluatorConfig,
    LocalRunner,
    PyRunner,
    RPCConfig,
    RPCRunner,
)
  2. Config
target = tvm.target.Target("opencl -device=mali", host="llvm -mtriple=aarch64-linux-gnu")  # Your target. This example uses a Mali GPU; change it to match your device.
onnx_model_path = "./yourmodel.onnx"  # Your ONNX model, just trained and ready to be deployed!!
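
If you are not targeting Mali, the target string looks different. A couple of hedged examples (adjust the triple/device to your own hardware):

# target = tvm.target.Target("llvm -mtriple=x86_64-linux-gnu")  # plain x86-64 CPU
# target = tvm.target.Target("cuda", host="llvm")               # NVIDIA GPU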
  3. Run tuning
onnx_model = onnx.load(onnx_model_path)
mod, params = relay.frontend.from_onnx(onnx_model)

# if you want to use mixed/fp16 precision, uncomment the following lines
# mod = apply_graph_transforms(
#     mod,
#     {
#         "mixed_precision": True,
#         "mixed_precision_calculation_type": "float16",
#         "mixed_precision_acc_type": "float16",
#     },
# )

# RPC config for remote device
rpc_config = RPCConfig(
    tracker_host="127.0.0.1",  # host where the RPC tracker runs
    tracker_port=9190,         # port of the RPC tracker
    tracker_key="rk3588",      # key the remote device registered with
    session_priority=1,
    session_timeout_sec=10,
)
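
Before tuning, it is worth checking that the tracker actually sees your board. A minimal sanity check, assuming a tracker is already running on 127.0.0.1:9190 and the device registered itself with the key "rk3588":

from tvm import rpc

tracker = rpc.connect_tracker("127.0.0.1", 9190)
print(tracker.text_summary())  # the "rk3588" key should appear in the summary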

# Evaluator config. You can change the number of measurements, repeat count, etc.
evaluator_config = EvaluatorConfig(
    number=10,        # measurements taken per trial
    repeat=1,         # how many times each measurement is repeated
    min_repeat_ms=1,  # minimum wall time per measurement, in ms
    enable_cpu_cache_flush=True,  # flush the CPU cache between runs for stabler numbers
)

# Use RPC runner to run on remote device
runner = RPCRunner(rpc_config, evaluator_config)
# Or use local runner
# runner = LocalRunner(evaluator_config)

# Start tuning and search for the best schedules
# I don't know how to resume an interrupted search; please let me know if you do
database = ms.relay_integration.tune_relay(
    mod=mod,
    params=params,
    target=target,
    max_trials_global=10000,  # a larger value gives better performance but takes longer to search
    runner=runner,
    work_dir="./work",
    seed=0
)
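
A partial answer to the resume question above: the tuning records are persisted as JSON files under work_dir, so a later session can at least reload them and compile without re-tuning. A hedged sketch, assuming the default JSONDatabase file names:

# Reload a previously saved tuning database (file names are the defaults
# written under work_dir; adjust if yours differ)
database = ms.database.JSONDatabase(
    path_workload="./work/database_workload.json",
    path_tuning_record="./work/database_tuning_record.json",
)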
  4. Compile
# Compile the best schedule
lib = ms.relay_integration.compile_relay(
    database=database,
    mod=mod,
    params=params,
    target=target,
)
  5. Export
import tvm.driver.tvmc.model as tvmc_model

# export lib, graph and parameters
model = tvmc_model.TVMCModel(mod, params)
# export package
model.export_package(lib, onnx_model_path.replace(".onnx", ".tar"), "aarch64-linux-gnu-gcc")  # change to your cross compiler, or leave it empty
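
If you don't need the tvmc package format, the compiled module can also be exported directly as a shared library (a hedged alternative; "yourmodel.so" is an arbitrary name here):

# Export as a plain .so instead of a tvmc .tar package
lib.export_library("yourmodel.so", cc="aarch64-linux-gnu-gcc")

A .so exported this way can be loaded later with tvm.runtime.load_module("yourmodel.so").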
  6. Run
import tvm
from tvm.driver import tvmc
from tvm.contrib import graph_executor
import time

package = tvmc.TVMCPackage("yourmodel.tar")  # the .tar exported in step 5

# tvmc.run is extremely slow!! 
# result = tvmc.run(package, device="cl", inputs={"input1": tvm.nd.array(your_input_tensor)})

lib = tvm.runtime.load_module(package.lib_path)
executor = graph_executor.create(package.graph, lib, tvm.cl()) # Change to tvm.cpu() if you want to run on CPU
executor.load_params(package.params)

executor.run() # warm up

inp = {"input1": tvm.nd.array(your_input_tensor)}
executor.set_input(**inp)
start_time = time.time()
executor.run()
result = executor.get_output(0).numpy()
end_time = time.time()

print(result)
print("time: ", end_time-start_time)