I tested a model containing only a single DCN (deformable convolution) layer, based on the reference there, on an NVIDIA Jetson Xavier NX board.
Related software versions:
PyTorch 1.6.0 (aarch64 build), torchvision 0.7.0
TVM main branch; the wheel file name gives the exact version (tvm-0.8.dev1687+gf6a404447-cp36)
Here are the measured inference times:
with target = 'llvm -mtriple=aarch64-linux-gnu' -> mean inference time (std dev): 197.94 ms (3.08 ms)
with target = tvm.target.cuda("nx") -> mean inference time (std dev): 481.27 ms (10.03 ms)
The CUDA build is more than twice as slow as the CPU build on the same board. Did I do something wrong?
Code snippet
import os
import numpy as np
import torch
import torchvision
from torchvision.ops import DeformConv2d as dcn
import tvm
from tvm import relay
from tvm.relay import testing
import tvm.contrib.graph_executor as graph_runtime
if __name__ == "__main__":
    dtype = "float32"
    # target = 'llvm -mtriple=aarch64-linux-gnu'
    target = tvm.target.cuda("nx")
    target_host = 'llvm -mtriple=aarch64-linux-gnu'
    # random data, offset, and weight tensors for a single 3x3 deformable conv
    cur_data = torch.randn([1, 3, 224, 224]).cuda()
    cur_data_np = cur_data.cpu().detach().numpy()
    cur_offset = torch.randn([1, 18, 224, 224]).cuda()
    cur_offset_np = cur_offset.cpu().detach().numpy()
    cur_weight = torch.randn([64, 3, 3, 3]).cuda()
    cur_weight_np = cur_weight.cpu().detach().numpy()
    # build and trace the torchvision DeformConv2d layer
    pytorch_model = dcn(3, 64, 3, padding=1, bias=False).cuda().eval()
    pytorch_model.weight = torch.nn.Parameter(cur_weight)
    scripted_model = torch.jit.trace(
        pytorch_model, [cur_data, cur_offset]
    ).eval()
    pytorch_res = pytorch_model(cur_data, cur_offset)
    # convert the traced model to Relay and build for the selected target
    shape_list = [("input0", [1, 3, 224, 224]), ("input1", [1, 18, 224, 224])]
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
    ctx = tvm.device(str(target), 0)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(
            mod, target=target, target_host=target_host, params=params
        )
    m = graph_runtime.GraphModule(lib["default"](ctx))
    m.set_input(0, tvm.nd.array(cur_data_np, ctx))
    m.set_input(1, tvm.nd.array(cur_offset_np, ctx))
    m.run()
    tvm_output = m.get_output(0)
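    # compare against the torchvision result to check that the converted graph
    # is numerically correct (tolerances here are a loose float32 choice)
    np.testing.assert_allclose(
        tvm_output.asnumpy(), pytorch_res.cpu().detach().numpy(),
        rtol=1e-4, atol=1e-4
    )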
    # evaluate
    print("Evaluate inference time cost on host...")
    ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=100)
    prof_res = np.array(ftimer().results) * 1000
    print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
          (np.mean(prof_res), np.std(prof_res)))
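I have not tried auto-tuning yet. If the poor CUDA number is just the untuned default schedule for the deformable conv op, something like the rough sketch below is what I would try next (assuming autotvm can extract tasks from this module; the log file name and trial count are placeholders):

from tvm import autotvm

tasks = autotvm.task.extract_from_program(
    mod["main"], params=params, target=target, target_host=target_host
)
log_file = "dcn_cuda_tuning.log"
for task in tasks:
    tuner = autotvm.tuner.XGBTuner(task)
    tuner.tune(
        n_trial=min(200, len(task.config_space)),
        measure_option=autotvm.measure_option(
            builder=autotvm.LocalBuilder(),
            runner=autotvm.LocalRunner(number=10, repeat=1, timeout=10),
        ),
        callbacks=[autotvm.callback.log_to_file(log_file)],
    )
# rebuild with the tuned schedules before measuring again
with autotvm.apply_history_best(log_file):
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, target_host=target_host, params=params)

After that I would rerun the same time_evaluator measurement with the tuned library.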