Another issue is that TVM is significantly slower than PyTorch. I believe this is because TVM is still using the CPU even though I set the target to cuda: I get similar speeds with the cpu target as with cuda. For the example below, PyTorch takes 0.005 seconds while TVM takes 0.015 seconds, and the difference grows for larger models (see also the timing note after the listing). I've also tried tuning the model, but I don't get any performance gains, and I get some warnings. The code I use:
import torch
from tvm import relay
import tvm
from torch import nn
from tvm.contrib.debugger.debug_executor import GraphModuleDebug
import time

HALF: bool = False
TUNING: bool = True


class Model(nn.Module):
    def __init__(
        self,
    ):
        super(Model, self).__init__()
        self.c1 = nn.Conv3d(1, 1, 3, padding=1)
        self.relu1 = nn.ReLU()
        self.pool = nn.GroupNorm(num_groups=1, num_channels=1)
        self.final_activation = nn.Sigmoid()
        self.apply(init_params)

    def forward(self, x):
        x = self.c1(x)
        x = self.relu1(x)
        x = self.pool(x)
        x = self.final_activation(x)
        return x


def init_params(m):
    if hasattr(m, "weight"):
        m.weight.data = torch.randn(m.weight.size()) * 0.1
    if hasattr(m, "bias"):
        m.bias.data = torch.randn(m.bias.size()) * 0.1
model = Model().cuda(0)
if HALF:
    model = model.half()

input_shape = (1, 1, 96, 96, 96)
input_name = [("input", (input_shape, "float16" if HALF else "float32"))]
input_data = torch.randn(input_shape)
if HALF:
    input_data = input_data.half()

modelScripted = torch.jit.trace(model, input_data.cuda())
mod, params = relay.frontend.from_pytorch(
    modelScripted, input_name, default_dtype=("float16" if HALF else "float32")
)

target = tvm.target.Target("cuda")
dev = tvm.device("cuda", 0)

if not TUNING:
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)
else:
    from tvm.autotvm.tuner import XGBTuner
    from tvm import autotvm

    number = 10
    repeat = 1
    min_repeat_ms = 100
    timeout = 100  # in seconds

    # create a TVM runner
    runner = autotvm.LocalRunner(
        number=number,
        repeat=repeat,
        timeout=timeout,
        min_repeat_ms=min_repeat_ms,
        enable_cpu_cache_flush=True,
    )
    tuning_option = {
        "tuner": "xgb",
        "trials": 20,
        "early_stopping": 100,
        "measure_option": autotvm.measure_option(
            builder=autotvm.LocalBuilder(build_func="default"), runner=runner
        ),
        "tuning_records": "tuning_records.json",  # TODO: change, log file
    }

    # begin by extracting the tuning tasks from the relay program
    tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

    # tune the extracted tasks sequentially
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
        tuner_obj = XGBTuner(task, loss_type="rank", num_threads=1)
        tuner_obj.tune(
            n_trial=min(tuning_option["trials"], len(task.config_space)),
            early_stopping=tuning_option["early_stopping"],
            measure_option=tuning_option["measure_option"],
            callbacks=[
                autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix),
                autotvm.callback.log_to_file(tuning_option["tuning_records"]),
            ],
        )

    with autotvm.apply_history_best(tuning_option["tuning_records"]):
        with tvm.transform.PassContext(opt_level=3, config={}):
            lib = relay.build(mod, target=target, params=params)
# executing
from tvm.contrib import graph_executor

dtype = "float16" if HALF else "float32"
m = graph_executor.GraphModule(lib["default"](dev))

start = time.time()
cinput = tvm.nd.array(input_data)
cinput = cinput.copyto(dev)
m.set_input("input", cinput)
m.run()
tvm_output = m.get_output(0).asnumpy()
print(f"tvm took: {time.time() - start}")

start = time.time()
pytorch_output = modelScripted.forward(input_data.cuda()).detach().cpu().numpy()
print(f"pytorch took: {time.time() - start}")

print(pytorch_output.flatten()[:10])
print(tvm_output.flatten()[:10])
print("DONE")
I get these logs while tuning:
[Task 1/ 2] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (20/20) | 16.15 s
WARNING:root:Could not find any valid schedule for task Task(func_name=conv3d_ncdhw.cuda, args=(('TENSOR', (1, 1, 96, 96, 96), 'float32'), ('TENSOR', (1, 1, 3, 3, 3), 'float32'), (1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1), 1, 'float32'), kwargs={}, workload=('conv3d_ncdhw.cuda', ('TENSOR', (1, 1, 96, 96, 96), 'float32'), ('TENSOR', (1, 1, 3, 3, 3), 'float32'), (1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1), 1, 'float32')). A file containing the errors has been written to /tmp/tvm_tuning_errors_8w8xy05f.log.
[Task 2/ 2] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (20/20) | 36.38 s
WARNING:root:Could not find any valid schedule for task Task(func_name=conv3d_ncdhw_winograd.cuda, args=(('TENSOR', (1, 1, 96, 96, 96), 'float32'), ('TENSOR', (1, 1, 3, 3, 3), 'float32'), (1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1), 1, 'float32'), kwargs={}, workload=('conv3d_ncdhw_winograd.cuda', ('TENSOR', (1, 1, 96, 96, 96), 'float32'), ('TENSOR', (1, 1, 3, 3, 3), 'float32'), (1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1), 1, 'float32')). A file containing the errors has been written to /tmp/tvm_tuning_errors_ao2ydcb0.log.
and here is the content of /tmp/tvm_tuning_errors_8w8xy05f.log (truncated; the full log was posted to Pastebin):
Traceback (most recent call last): File "/root/.local/lib/python3.8/site-pack
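For what it's worth, one way to check whether the build actually produced GPU code at all (a sketch, using the lib and dev from the listing above) would be to inspect the imported modules of the compiled library:

# If relay.build really targeted CUDA, the host module should import a "cuda" module.
device_modules = lib.get_lib().imported_modules
print(device_modules)  # expect a non-empty list when device code was generated
if device_modules:
    print(device_modules[0].type_key)  # should print "cuda" for a CUDA build
print(dev)  # sanity check: should be cuda(0)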