I tried to tune a simple 2D U-Net, but the TVM model takes 11 ms per inference while the PyTorch model takes only 8 ms.
My PyTorch version is 1.8.1, and TVM is built from the main branch on GitHub. The device is an NVIDIA GeForce RTX 2070 with driver 470.103.01.
My code
import tvm
from tvm import relay
from tvm import relay, auto_scheduler
from tvm.runtime.ndarray import device
import torch  # used below (randn, jit.load) but missing from the import list

# Example input: a random FP16 tensor on the GPU. Only its shape is passed
# to the Relay frontend; the values themselves are never used.
shape = (1, 1, 256, 256)
input0 = torch.randn(shape).half().cuda()

# Load the TorchScript trace, cast it to FP16 and move it to the GPU.
path = 'unet2d_fp16.trace'
trace = torch.jit.load(path).half().cuda()

# Convert the traced module to a Relay module plus its parameter dict.
relay_model, params = relay.frontend.from_pytorch(
    trace,
    [('input0', input0.shape)],
    default_dtype='float16',
)

# Extract the auto_scheduler tuning tasks (and their weights) for CUDA.
target = tvm.target.cuda()
tasks, task_weights = auto_scheduler.extract_tasks(
    relay_model["main"], params, target
)
# Measurement context for the tuning run. The runner it provides is handed
# to TuningOptions below.
measure_ctx = auto_scheduler.LocalRPCMeasureContext(
    repeat=1,
    min_repeat_ms=100,
    timeout=100,
)

# NOTE(review): `load_model_file` names a pretrained cost-model file. If the
# intent was to resume from earlier tuning records, `load_log_file` looks
# like the matching argument -- confirm against the TaskScheduler docs.
tuner = auto_scheduler.TaskScheduler(
    tasks,
    task_weights,
    load_model_file='unet2d_fp16',
)

tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=36000,
    num_measures_per_round=64,
    early_stopping=500,
    verbose=True,
    runner=measure_ctx.runner,
    # Every measured record is appended to this log; the compile step reads it.
    measure_callbacks=[auto_scheduler.RecordToFile('unet2d_fp16.log')],
)

try:
    tuner.tune(tune_option)
finally:
    # Release the measurement context explicitly so the processes it started
    # are shut down even if tuning is interrupted.
    del measure_ctx
print("Compile...")

# Build with the best schedules found in the tuning log. Both context
# managers must enclose relay.build: ApplyHistoryBest supplies the records,
# and the PassContext flag switches the Relay backend to auto_scheduler.
with auto_scheduler.ApplyHistoryBest("unet2d_fp16.log"):
    with tvm.transform.PassContext(
        opt_level=3,
        config={"relay.backend.use_auto_scheduler": True},
    ):
        lib = relay.build(relay_model, target=target, params=params)

# Write the compiled module out as a shared library.
lib.export_library('unet2d_fp16.so')
What have I missed?