PyTorch inference time faster than auto-tuned TVM

Hi everyone,

I am trying to write some code to auto-tune a pretrained quantized MobileNetV2 network so that it runs faster on the Raspberry Pi 4. I can tune the model, then build and run the compiled model on the hardware, but the inference time is slower than using the vanilla model with PyTorch. I would appreciate it if someone could check my code and tell me if there is something wrong or missing. Thank you.

import os
import time
import tvm
from tvm import relay, autotvm
import tvm.relay.testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
import tvm.contrib.graph_executor as runtime
from PIL import Image
import numpy as np
import torch
from torchvision.models.quantization import mobilenet as qmobilenet
from tvm.contrib.download import download_testdata
import torchvision.transforms as transforms

# Compilation target for the remote board.
target = tvm.target.arm_cpu("rasp4b64")
# NOTE(review): this key has to match the one the board's RPC server used when
# registering with the tracker; "rk3399" looks carried over from another
# board -- confirm.
device_key = "rk3399"
use_android = False

#### TUNING OPTION ####
network = "mobilenet_v2"
log_file = f"{device_key}.{network}.log"
tuningDataFile = f"{device_key}.{network}.json"
hostIP = "10.46.30.51"
dtype = "float32"

# Keyword arguments that are unpacked into tune_tasks().
tuning_option = dict(
    log_filename=log_file,
    tuner="xgb",
    n_trial=1500,
    early_stopping=800,
    measure_option=autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"),
        runner=autotvm.RPCRunner(
            device_key,
            host=hostIP,
            port=9190,
            # number: how many times a candidate is run and averaged to form
            # one measurement (not the number of configurations tried).
            number=5,
            # repeat: how many such measurements are taken per candidate.
            repeat=1,
            # since we're tuning on a CPU, can be set to 0
            min_repeat_ms=0,
            timeout=10,
        ),
    ),
)

def tune_tasks(
    tasks,
    measure_option,
    tuner="xgb",
    n_trial=1000,
    early_stopping=None,
    log_filename="tuning.log",
    use_transfer_learning=True,
):
    """Tune each AutoTVM task in turn and write the best records to ``log_filename``.

    Tasks are processed in reverse order. Every measurement is appended to a
    temporary ``<log_filename>.tmp`` file; once all tasks are done the best
    records are picked into ``log_filename`` and the temporary file is removed.

    Args:
        tasks: Sequence of AutoTVM tasks (must support ``len`` and ``reversed``).
        measure_option: Measurement settings forwarded to each tuner's ``tune``.
        tuner: One of ``"xgb"``, ``"xgb-rank"``, ``"xgb_knob"``,
            ``"xgb_itervar"``, ``"xgb_curve"``, ``"ga"``, ``"random"`` or
            ``"gridsearch"``.
        n_trial: Upper bound on trials per task; capped at the size of the
            task's configuration space.
        early_stopping: Forwarded unchanged to the tuner's ``tune``.
        log_filename: Destination file for the best records.
        use_transfer_learning: When true, records already present in the
            temporary log are loaded into each new tuner before it starts.

    Raises:
        ValueError: If ``tuner`` is not one of the recognised names. This is
            checked up front, before any file is touched or any task is tuned.
    """
    # Lambdas keep construction lazy: only the selected tuner class is used.
    tuner_factories = {
        "xgb": lambda tsk: XGBTuner(tsk, loss_type="rank"),
        "xgb-rank": lambda tsk: XGBTuner(tsk, loss_type="rank"),
        "xgb_knob": lambda tsk: XGBTuner(tsk, loss_type="rank", feature_type="knob"),
        "xgb_itervar": lambda tsk: XGBTuner(tsk, loss_type="rank", feature_type="itervar"),
        "xgb_curve": lambda tsk: XGBTuner(tsk, loss_type="rank", feature_type="curve"),
        "ga": lambda tsk: GATuner(tsk, pop_size=50),
        "random": lambda tsk: RandomTuner(tsk),
        "gridsearch": lambda tsk: GridSearchTuner(tsk),
    }
    if tuner not in tuner_factories:
        raise ValueError(f"Invalid tuner: {tuner}")
    make_tuner = tuner_factories[tuner]

    # create tmp log file
    tmp_log_file = log_filename + ".tmp"
    print(tmp_log_file)

    # Start from a clean temporary log so stale records are not reused.
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    total = len(tasks)
    for i, tsk in enumerate(reversed(tasks)):
        started = time.time()
        prefix = "[Task %2d/%2d] " % (i + 1, total)

        tuner_obj = make_tuner(tsk)

        # Seed the tuner with the measurements of the tasks tuned so far.
        if use_transfer_learning and os.path.isfile(tmp_log_file):
            tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # Never ask for more trials than there are configurations.
        tsk_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(
            n_trial=tsk_trial,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
                autotvm.callback.log_to_file(tmp_log_file),
            ],
        )

        # Report with the same 1-based numbering as the progress-bar prefix,
        # and without truncating the elapsed time to whole seconds first.
        elapsed_min = (time.time() - started) / 60
        print(f"Completed task {i + 1}/{total} in {elapsed_min:.2f} min")

    # pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)

    os.remove(tmp_log_file)


def tune_and_evaluate(tuning_opt, mod, params, input_shape, input_name):
    """Tune the conv2d tasks of ``mod``, build it, and benchmark it on the remote device.

    Relies on the module-level settings ``target``, ``device_key``, ``hostIP``,
    ``dtype`` and ``use_android``.

    Args:
        tuning_opt: Mapping of keyword arguments unpacked into ``tune_tasks``.
        mod: Relay module whose ``"main"`` function is tuned and compiled.
        params: Parameters passed to task extraction and to the build.
        input_shape: Shape of the random input used for the benchmark.
        input_name: Name of the graph input that receives that data.
    """
    # extract workloads from relay program
    print("Extract tasks...")
    # Only nn.conv2d workloads are extracted, so only those get tuned.
    # NOTE(review): other operators presumably fall back to untuned default
    # schedules -- confirm this is intended.
    tasks = autotvm.task.extract_from_program(
        mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
    )

    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # Read the records from the file this run actually wrote, rather than from
    # the unrelated global ``log_file``. "tuning.log" is tune_tasks' default
    # when the caller supplies no log_filename.
    best_records = tuning_opt.get("log_filename", "tuning.log")

    # compile kernels with history best records; the records only matter while
    # building, so the context is limited to the build itself
    with autotvm.apply_history_best(best_records):
        print("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build_module.build(mod, target=target, params=params)

    # export library (create the output directory first so the export cannot
    # fail on a fresh checkout)
    lib_dir = "models_generated/"
    os.makedirs(lib_dir, exist_ok=True)
    if use_android:
        from tvm.contrib import ndk

        filename = "net_dep_and_opt.so"
        lib_path = os.path.join(lib_dir, filename)
        lib.export_library(lib_path, ndk.create_shared)
    else:
        filename = "net_dep_and_opt.tar"
        lib_path = os.path.join(lib_dir, filename)
        lib.export_library(lib_path)

    # upload module to device
    print("Upload...")
    remote = autotvm.measure.request_remote(device_key, hostIP, 9190, timeout=10000)
    remote.upload(lib_path)

    # The uploaded file is loaded on the remote side by its base name.
    rlib = remote.load_module(filename)

    # create the graph executor on the device and feed it random input data
    dev = remote.device(str(target), 0)
    module = runtime.GraphModule(rlib["default"](dev))
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
    module.set_input(input_name, data_tvm)

    # evaluate
    print("Evaluate inference time cost...")
    print(module.benchmark(dev, number=1, repeat=10))


def quantize_model(model, inp):
    """Quantize ``model`` in place, calibrating on the single batch ``inp``.

    The model's ``fuse_model`` method is called first, the default "fbgemm"
    qconfig is attached, observers are inserted, ``inp`` is run through the
    model once as a dummy calibration pass, and the model is then converted.
    Nothing is returned; ``model`` itself is modified.
    """
    quant = torch.quantization

    model.fuse_model()
    model.qconfig = quant.get_default_qconfig("fbgemm")

    # With inplace=True, prepare() hands back the same (now observed) model.
    observed = quant.prepare(model, inplace=True)
    # Dummy calibration: one forward pass lets the observers record ranges.
    observed(inp)
    quant.convert(observed, inplace=True)


def get_transform():
    """Return the preprocessing pipeline applied to an input image.

    The steps, in order: resize to 256, center-crop to 224, convert to a
    tensor, then normalize each channel with a fixed mean and standard
    deviation (presumably the usual ImageNet statistics).
    """
    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    return transforms.Compose(steps)


def get_real_image(im_height, im_width):
    """Download the sample cat picture and return it resized.

    Args:
        im_height: Height of the returned image, in pixels.
        im_width: Width of the returned image, in pixels.

    Returns:
        A PIL image of size ``im_width`` x ``im_height``.
    """
    img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
    img_path = download_testdata(img_url, "cat.png", module="data")
    # PIL's resize() expects (width, height); passing (height, width) would
    # transpose the requested size for any non-square request. The context
    # manager closes the underlying file once the resized copy exists.
    with Image.open(img_path) as img:
        return img.resize((im_width, im_height))


def get_imagenet_input():
    """Return the preprocessed sample image as a NumPy array with a leading batch axis."""
    image = get_real_image(224, 224)
    tensor = get_transform()(image)
    # Prepend a batch axis of length 1.
    return tensor.numpy()[np.newaxis, ...]


# Description of the single graph input handed to the Relay frontend.
input_name = "input"  # the input name can be arbitrary for the PyTorch frontend.
input_shape = (1, 3, 224, 224)
input_shapes = [(input_name, input_shape)]

# Pretrained MobileNetV2 in eval mode, quantized in place below.
qmodel = qmobilenet.mobilenet_v2(pretrained=True).eval()

# The same preprocessed image serves for calibration, tracing and reference.
inp = get_imagenet_input()
pt_inp = torch.from_numpy(inp)

quantize_model(qmodel, pt_inp)
script_module = torch.jit.trace(qmodel, pt_inp).eval()

# Reference output of the traced PyTorch model.
with torch.no_grad():
    pt_result = script_module(pt_inp).numpy()

# Import into Relay, then tune, build and benchmark on the remote device.
mod, params = relay.frontend.from_pytorch(script_module, input_shapes)

tune_and_evaluate(tuning_option, mod, params, input_shape, input_name)

Does nobody know? Really?