Fine tuned OpenCL gives incorrect outputs

When I auto-schedule a model for a remote device with a Mali GPU, the outputs of the compiled fine-tuned model are significantly different from the outputs of the model compiled without auto-scheduling or compiled for the CPU of the device. The following code reproduces this problem:

import os

import numpy as np

import tvm
import tvm.relay.testing
from tvm import relay, auto_scheduler, rpc
from tvm.contrib import utils, ndk, graph_executor
# NOTE(review): the original post had a truncated import; the model loader
# presumably comes from MXNet's Gluon model zoo (matches from_mxnet below).
from mxnet.gluon.model_zoo.vision import get_model

if __name__ == "__main__":
    # Load a pretrained MobileNet and convert it to a Relay module.
    block = get_model("mobilenet1.0", pretrained=True)
    shape_dict = {"data": (1, 3, 224, 224)}
    mod, params = relay.frontend.from_mxnet(block, shape_dict)

    # RPC tracker with which the remote Android/Mali device is registered.
    tracker_host = ""  # TODO: fill in the tracker host
    tracker_port = 9000
    key = "android"
    os.environ["TVM_NDK_CC"] = (
        "/users/nkaminsky/library/android/android-toolchain-arm64/bin/"
        "aarch64-linux-android-g++"
    )
    output_file = "auto_scheduled_model_mxnet.json"

    # Heterogeneous target: OpenCL kernels on the Mali GPU, host code built
    # for the board's AArch64 CPU.
    target = tvm.target.Target(
        "opencl -device=mali", host="llvm -mtriple=arm64-linux-android"
    )

    # Extract tuning tasks and auto-schedule them, measuring on the remote
    # device through the RPC tracker.
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    builder = auto_scheduler.LocalBuilder(build_func="ndk")
    runner = auto_scheduler.RPCRunner(
        key=key, host=tracker_host, port=tracker_port,
        priority=0, number=3, repeat=1,
    )
    tune_options = auto_scheduler.TuningOptions(
        num_measure_trials=35,
        builder=builder,
        runner=runner,
        measure_callbacks=[auto_scheduler.RecordToFile(output_file)],
    )
    tuner.tune(tune_options)

    # Compile with the tuned schedules from the log file applied.
    with auto_scheduler.ApplyHistoryBest(output_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)

    # Ship the compiled library to the remote device and load it there.
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=0)
    device = remote.cl(0)  # remote OpenCL device
    tmp = utils.tempdir()
    lib_fname = tmp.relpath("net.so")
    lib.export_library(lib_fname, ndk.create_shared)
    remote.upload(lib_fname)  # must upload before load_module
    exported_lib = remote.load_module("net.so")
    module = graph_executor.GraphModule(exported_lib["default"](device))

    # Reference build: untuned LLVM on the local CPU.
    local_device = tvm.device("llvm", 0)
    with tvm.transform.PassContext(opt_level=3):
        unoptimized_lib = relay.build(mod, target="llvm", params=params)
    unoptimized_module = graph_executor.GraphModule(
        unoptimized_lib["default"](local_device)
    )

    # Run both modules on the same random input.
    # BUG FIXES vs. the posted snippet:
    #   * the input name must match the Relay input declared in shape_dict
    #     ("data"), not "input" — a mismatched name is silently ignored by
    #     set_input, leaving the real input uninitialized;
    #   * the comprehension variable must not shadow the RPC `key` above;
    #   * set_input()/run() must be called before get_output(), otherwise
    #     both outputs are garbage and the comparison is meaningless.
    rng = np.random.default_rng(seed=0)
    dummy_input = {"data": rng.random([1, 3, 224, 224]).astype("float32")}
    tvm_dummy = {name: tvm.nd.array(arr) for name, arr in dummy_input.items()}

    module.set_input(**tvm_dummy)
    module.run()
    unoptimized_module.set_input(**tvm_dummy)
    unoptimized_module.run()

    module_output = module.get_output(0).numpy()
    unoptimized_module_output = unoptimized_module.get_output(0).numpy()
    diff = np.abs(module_output - unoptimized_module_output)
    assert (diff <= 1e-3).all(), (
        f"maximum element difference: {np.amax(diff)}, "
        f"l2 diff: {np.linalg.norm(diff)}"
    )

The numerical error I get for the fine-tuned model is around L2 = 35. However, if I build the module without applying the JSON log file produced by auto-scheduling, the assert passes (L2 of less than 1). Any help will be greatly appreciated!