Hello,
I am currently attempting to benchmark the execution times of various neural networks in TVM on my computer. However, whenever I quantize a network to 8 bits, it runs significantly slower than when it is executed at full precision.
For example, when I quantize resnet-18 to 8 bits and tune it, the mean execution time is ~160 ms, whereas the unquantized model runs in ~90 ms. I would expect the 8-bit model to be much faster, but that does not seem to be the case.
Has anyone encountered this issue before? Are 8-bit operations not well optimized in TVM? I am new to TVM, so perhaps I am overlooking something simple.
A reproducible script is included below. Any insights or help would be greatly appreciated.
import os
import numpy as np
import tvm
from tvm import te
from tvm import relay
from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
import tvm.contrib.graph_runtime as runtime
from tvm.relay import testing
import tvm.relay.quantize as quantize
# Target an AArch64 CPU; the Neoverse N1 core is selected via -mcpu (not -mattr), with NEON and the int8 matrix-multiply extension enabled
target = tvm.target.Target('llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mcpu=neoverse-n1 -mattr=+neon,+i8mm')
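# Benchmark configuration: network, batch size, data type, layout, shapes, and tuning log files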
network = "resnet-18"
batch_size = 1
dtype = "float32"
layout = "NCHW"
log_file = "%s.log" % network
graph_opt_sch_file = "%s_graph_opt.log" % network
input_shape = (batch_size, 3, 224, 224)
output_shape = (batch_size, 1000)
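# Get the FP32 ResNet-18 workload (Relay module + parameters) from relay.testing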
mod, params = relay.testing.resnet.get_workload(
    num_layers=18, batch_size=batch_size, layout=layout
)
### The quantization in question
with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
    mod = relay.quantize.quantize(mod, params)
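# Extract AutoTVM tuning tasks for the conv2d operators in the quantized module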
tasks = autotvm.task.extract_from_program(
    mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
)
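# Tuning options: XGBoost tuner, up to 100 trials per task, measured with a local builder and runner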
tuning_option = {
    "log_filename": log_file,
    "tuner": "xgb",
    "n_trial": 100,
    "early_stopping": 800,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150),
    ),
}
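# Tune each extracted task and append the results to the log file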
for i, task in enumerate(tasks):
    print("Tuning task %d/%d" % (i + 1, len(tasks)))
    tuner = XGBTuner(task)
    tuner.tune(
        n_trial=min(tuning_option["n_trial"], len(task.config_space)),
        early_stopping=tuning_option["early_stopping"],
        measure_option=tuning_option["measure_option"],
        callbacks=[
            autotvm.callback.progress_bar(tuning_option["n_trial"]),
            autotvm.callback.log_to_file(tuning_option["log_filename"]),
        ],
    )
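# Compile the quantized module with the best schedules recorded during tuning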
with autotvm.apply_history_best(log_file):
    with tvm.transform.PassContext(opt_level=3):
        graph, lib, params = relay.build_module.build(
            mod, target=target, params=params
        )
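# Create the graph runtime and feed it a random input tensor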
ctx = tvm.device(str(target), 0)
module = runtime.create(graph, lib, ctx)
data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
module.set_input("data", data_tvm)
module.set_input(**params)
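# Time 60 end-to-end runs and report the mean and standard deviation in milliseconds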
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=60)
prof_res = np.array(ftimer().results) * 1000 # (convert to milliseconds)
avg_inference_time = np.mean(prof_res)
std_dev = np.std(prof_res)
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (avg_inference_time, std_dev))
# pick_best(in_file, out_file) keeps only the best record per task and writes them to a separate log
autotvm.record.pick_best(log_file, "%s_best.log" % network)
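# Save the measured timings to a text file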
with open('avg_inference_time.txt', 'w') as f:
    f.write(f'Average Inference Time: {avg_inference_time} ms with std dev {std_dev} ms\n')