Fine tuned OpenCL gives incorrect outputs

When I auto-schedule a model for a remote device with a Mali GPU, the outputs of the compiled fine-tuned model are significantly different from the outputs of the model compiled without auto-scheduling or compiled for the CPU of the device. The following code reproduces this problem:

import os

import numpy as np

import tvm
import tvm.relay.testing
from tvm import relay, auto_scheduler, rpc
from tvm.contrib import utils, ndk, graph_executor
# NOTE(review): the original post had a truncated import; the model loader
# presumably comes from MXNet's Gluon model zoo (matches from_mxnet below).
from mxnet.gluon.model_zoo.vision import get_model

if __name__ == "__main__":
    # Load a pretrained MobileNet and convert it to a Relay module.
    block = get_model("mobilenet1.0", pretrained=True)
    shape_dict = {"data": (1, 3, 224, 224)}
    mod, params = relay.frontend.from_mxnet(block, shape_dict)

    # RPC tracker with which the remote Android/Mali device is registered.
    tracker_host = ""  # TODO: fill in the tracker host
    tracker_port = 9000
    key = "android"
    os.environ["TVM_NDK_CC"] = (
        "/users/nkaminsky/library/android/android-toolchain-arm64/bin/"
        "aarch64-linux-android-g++"
    )
    output_file = "auto_scheduled_model_mxnet.json"

    # Heterogeneous target: OpenCL kernels on the Mali GPU, host code built
    # for the board's AArch64 CPU.
    target = tvm.target.Target(
        "opencl -device=mali", host="llvm -mtriple=arm64-linux-android"
    )

    # Extract tuning tasks and auto-schedule them, measuring on the remote
    # device through the RPC tracker.
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    builder = auto_scheduler.LocalBuilder(build_func="ndk")
    runner = auto_scheduler.RPCRunner(
        key=key, host=tracker_host, port=tracker_port,
        priority=0, number=3, repeat=1,
    )
    tune_options = auto_scheduler.TuningOptions(
        num_measure_trials=35,
        builder=builder,
        runner=runner,
        measure_callbacks=[auto_scheduler.RecordToFile(output_file)],
    )
    tuner.tune(tune_options)

    # Compile with the tuned schedules from the log file applied.
    with auto_scheduler.ApplyHistoryBest(output_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)

    # Ship the compiled library to the remote device and load it there.
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=0)
    device = remote.cl(0)  # remote OpenCL device
    tmp = utils.tempdir()
    lib_fname = tmp.relpath("net.so")
    lib.export_library(lib_fname, ndk.create_shared)
    remote.upload(lib_fname)  # must upload before load_module
    exported_lib = remote.load_module("net.so")
    module = graph_executor.GraphModule(exported_lib["default"](device))

    # Reference build: untuned LLVM on the local CPU.
    local_device = tvm.device("llvm", 0)
    with tvm.transform.PassContext(opt_level=3):
        unoptimized_lib = relay.build(mod, target="llvm", params=params)
    unoptimized_module = graph_executor.GraphModule(
        unoptimized_lib["default"](local_device)
    )

    # Run both modules on the same random input.
    # BUG FIXES vs. the posted snippet:
    #   * the input name must match the Relay input declared in shape_dict
    #     ("data"), not "input" — a mismatched name is silently ignored by
    #     set_input, leaving the real input uninitialized;
    #   * the comprehension variable must not shadow the RPC `key` above;
    #   * set_input()/run() must be called before get_output(), otherwise
    #     both outputs are garbage and the comparison is meaningless.
    rng = np.random.default_rng(seed=0)
    dummy_input = {"data": rng.random([1, 3, 224, 224]).astype("float32")}
    tvm_dummy = {name: tvm.nd.array(arr) for name, arr in dummy_input.items()}

    module.set_input(**tvm_dummy)
    module.run()
    unoptimized_module.set_input(**tvm_dummy)
    unoptimized_module.run()

    module_output = module.get_output(0).numpy()
    unoptimized_module_output = unoptimized_module.get_output(0).numpy()
    diff = np.abs(module_output - unoptimized_module_output)
    assert (diff <= 1e-3).all(), (
        f"maximum element difference: {np.amax(diff)}, "
        f"l2 diff: {np.linalg.norm(diff)}"
    )

The numerical error I get for the fine-tuned model is around L2 = 35. However, if I build the module without applying the JSON log file produced by auto-scheduling, the assert passes (L2 of less than 1). Any help will be greatly appreciated!