Recently I noticed that Relay performs an optimization which replaces relay.nn.conv2d with relay.nn.contrib_conv2d_winograd_without_weight_transform, so I ran a script to show the time gap between them. The code snippet is the following:
import numpy as np
import tvm
from tvm import relay
import time
import timeit
from tvm.contrib import graph_executor
def relay_conv2d(data_np, weight_np):
    """Build, compile, and time a plain relay.nn.conv2d on CUDA.

    Args:
        data_np: input tensor as a numpy array in NCHW layout.
        weight_np: kernel tensor as a numpy array in OIHW layout.

    Returns:
        Mean wall-clock time (milliseconds) per module.run() call,
        averaged over timing_repeat batches of timing_number runs.
    """
    net = relay.nn.conv2d(
        relay.const(data_np), relay.const(weight_np), (1, 1), (1, 1),
        data_layout='NCHW', kernel_layout='OIHW')
    mod = tvm.IRModule.from_expr(net)
    mod = relay.transform.InferType()(mod)
    target = tvm.target.Target('cuda')
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target)
    print(mod["main"].astext(show_meta_data=False))
    dev = tvm.cuda()
    module = graph_executor.GraphModule(lib["default"](dev))
    # Warm up once so the first timed run does not include lazy
    # initialization / first-launch overhead.
    module.run()
    timing_number = 100
    timing_repeat = 100
    # NOTE(review): wall-clock timeit around module.run() may not account for
    # CUDA stream asynchrony; TVM's module.benchmark()/time_evaluator is the
    # recommended way to time GPU kernels — confirm before trusting the numbers.
    t = (
        np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
        * 1000
        / timing_number
    )
    return np.mean(t)
def relay_winograd(data_np, weight_np, tile_size):
    """Build, compile, and time a Winograd conv2d (pre-transformed weight) on CUDA.

    Args:
        data_np: input tensor as a numpy array in NHWC layout.
        weight_np: already Winograd-transformed kernel as a numpy array of
            shape (tile_size + 2, tile_size + 2, in_channels, out_channels),
            matching kernel_layout='HWIO'.
        tile_size: Winograd tile size (the script only exercises 4).

    Returns:
        Mean wall-clock time (milliseconds) per module.run() call.
    """
    # weight = relay.nn.contrib_conv2d_winograd_weight_transform(relay.const(weight_np), 4)
    # Last axis of HWIO is the output-channel count expected by `channels`.
    c = weight_np.shape[3]
    weight = relay.const(weight_np)
    net = relay.nn.contrib_conv2d_winograd_without_weight_transform(
        data=relay.const(data_np),
        weight=weight,
        tile_size=tile_size,
        strides=(1, 1),
        padding=(1, 1),
        channels=c,
        kernel_size=(3, 3),
        data_layout='NHWC',
        kernel_layout='HWIO',
        out_dtype='float32')
    mod = tvm.IRModule.from_expr(net)
    mod = relay.transform.InferType()(mod)
    target = tvm.target.Target('cuda')
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target)
    print(mod["main"].astext(show_meta_data=False))
    dev = tvm.cuda()
    module = graph_executor.GraphModule(lib["default"](dev))
    # Warm up once so the first timed run does not include lazy
    # initialization / first-launch overhead.
    module.run()
    timing_number = 100
    timing_repeat = 100
    # NOTE(review): see relay_conv2d — wall-clock timeit may not sync the CUDA
    # stream; prefer module.benchmark()/time_evaluator for GPU timing.
    t = (
        np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
        * 1000
        / timing_number
    )
    return np.mean(t)
if __name__=="__main__":
size = 28
channel = 128
data = np.random.uniform(size=(1,channel,size,size)).astype(np.float32)
weight = np.random.uniform(size=(channel,channel,3,3)).astype(np.float32)
print(relay_conv2d(data,weight))
tile_size = 4# only support 4
khat = 2+tile_size
data = np.random.uniform(size=(1,size,size,channel)).astype(np.float32)
weight = np.random.uniform(size=(khat,khat,channel,channel)).astype(np.float32)
print(relay_winograd(data,weight,tile_size))
And I noticed that sometimes Winograd is slower than the naive conv2d, and most of the time they take a similar amount of time. The same result happened when I ran the TE script (from topi) and tuned it with Ansor. I don't know why?