When I profile the Relax VM with htop or other profiling tools (such as VTune), the measured parallelism stays close to 1, unlike the Relay VM, where the parallelism is close to the number of threads. Here is my code:
import tvm
from tvm import relax
import torch
import numpy as np
from torch import fx
from tvm.relax.frontend.torch import from_fx
from torchvision.models.resnet import ResNet18_Weights, resnet18
import os
import tempfile
# Request 16 worker threads from the TVM runtime via its environment variable.
# NOTE(review): this is set after `import tvm`; presumably the runtime reads
# the variable lazily when its thread pool is first used -- confirm for the
# installed TVM version, or export the variable before starting Python.
num_threads = 16
os.environ["TVM_NUM_THREADS"] = str(num_threads)

# Compile for and run on the host CPU through LLVM.
device = tvm.cpu(0)
target = tvm.target.Target("llvm")

torch_model = resnet18(weights=ResNet18_Weights.DEFAULT)

# Give the input shape and data type. The shape is defined once so that the
# traced model signature and the test tensor below cannot drift apart.
input_shape = (16, 3, 224, 224)
input_info = [(input_shape, "float32")]

# Convert the model to IRModule
with torch.no_grad():
    torch_fx_model = fx.symbolic_trace(torch_model)
    mod = from_fx(torch_fx_model, input_info)

# NOTE(review): nothing in this script tunes or schedules the generated
# kernels before `relax.build`. If execution stays on a single core, that is
# the first thing to check (e.g. a MetaSchedule tuning pass) -- confirm
# against the Relax documentation for the installed TVM version.
ex = relax.build(mod, target=target)
vm = relax.VirtualMachine(ex, device=device)

# Despite the `gpu_` prefix, both arrays live on `device`, i.e. the CPU.
gpu_data = tvm.nd.array(np.random.rand(*input_shape).astype("float32"), device)
gpu_out = vm["main"](gpu_data).numpy()
print(gpu_out.shape)