Does Winograd really speed things up?

Recently I noticed that Relay performs an optimization which replaces relay.nn.conv2d with relay.nn.contrib_conv2d_winograd_without_weight_transform, so I wrote a script to measure the time gap between them. The code snippet is as follows:

import numpy as np
import tvm
from tvm import relay
import time
import timeit
from tvm.contrib import graph_executor

def relay_conv2d(data_np, weight_np):
    """Benchmark a direct 3x3 ``relay.nn.conv2d`` on CUDA.

    The activation is declared as a graph input (``relay.var``) rather than
    a ``relay.const``. With both operands constant the graph has no runtime
    inputs, so constant folding at ``opt_level=3`` can evaluate the whole
    convolution at build time and the timed ``run`` would not execute the
    convolution kernel at all.

    Parameters
    ----------
    data_np : numpy.ndarray
        Input activations, passed with ``data_layout='NCHW'``.
    weight_np : numpy.ndarray
        Convolution kernel, passed with ``kernel_layout='OIHW'``.

    Returns
    -------
    float
        Mean latency of one ``run`` call in milliseconds.
    """
    # Runtime input: keeps the convolution in the compiled graph.
    data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype))
    net = relay.nn.conv2d(
        data,
        relay.const(weight_np),
        (1, 1),
        (1, 1),
        data_layout="NCHW",
        kernel_layout="OIHW",
    )
    mod = tvm.IRModule.from_expr(relay.Function([data], net))
    mod = relay.transform.InferType()(mod)
    target = tvm.target.Target("cuda")
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target)
    print(mod["main"].astext(show_meta_data=False))

    dev = tvm.cuda()
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input("data", data_np)

    timing_number = 100
    timing_repeat = 100
    # time_evaluator measures on the device side; a host-side timeit around
    # module.run() is not a reliable measure of GPU kernel time.
    ftimer = module.module.time_evaluator(
        "run", dev, number=timing_number, repeat=timing_repeat
    )
    # results holds the per-repeat mean time of one run, in seconds.
    t = np.array(ftimer().results) * 1000
    return np.mean(t)


def relay_winograd(data_np, weight_np, tile_size):
    """Benchmark ``contrib_conv2d_winograd_without_weight_transform`` on CUDA.

    The activation is declared as a graph input (``relay.var``) rather than
    a ``relay.const``. With both operands constant the graph has no runtime
    inputs, so constant folding at ``opt_level=3`` can evaluate the whole
    operator at build time and the timed ``run`` would not execute the
    Winograd kernel at all.

    Parameters
    ----------
    data_np : numpy.ndarray
        Input activations, passed with ``data_layout='NHWC'``.
    weight_np : numpy.ndarray
        Kernel passed straight to the operator with ``kernel_layout='HWIO'``.
        No weight transform is applied here, so the caller presumably
        supplies an already-transformed kernel -- TODO confirm.
    tile_size : int
        Winograd tile size forwarded to the operator.

    Returns
    -------
    float
        Mean latency of one ``run`` call in milliseconds.
    """
    # Last axis of the weight is used as the output channel count.
    out_channels = weight_np.shape[3]
    # Runtime input: keeps the operator in the compiled graph.
    data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype))
    net = relay.nn.contrib_conv2d_winograd_without_weight_transform(
        data=data,
        weight=relay.const(weight_np),
        tile_size=tile_size,
        strides=(1, 1),
        padding=(1, 1),
        channels=out_channels,
        kernel_size=(3, 3),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    mod = tvm.IRModule.from_expr(relay.Function([data], net))
    mod = relay.transform.InferType()(mod)
    target = tvm.target.Target("cuda")
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target)
    print(mod["main"].astext(show_meta_data=False))

    dev = tvm.cuda()
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input("data", data_np)

    timing_number = 100
    timing_repeat = 100
    # time_evaluator measures on the device side; a host-side timeit around
    # module.run() is not a reliable measure of GPU kernel time.
    ftimer = module.module.time_evaluator(
        "run", dev, number=timing_number, repeat=timing_repeat
    )
    # results holds the per-repeat mean time of one run, in seconds.
    t = np.array(ftimer().results) * 1000
    return np.mean(t)

if __name__ == "__main__":
    # Problem size shared by both benchmarks: a single size x size feature
    # map with `channel` input and output channels.
    size = 28
    channel = 128

    # Direct convolution: NCHW activations with a 3x3 OIHW kernel.
    nchw_shape = (1, channel, size, size)
    oihw_shape = (channel, channel, 3, 3)
    data = np.random.uniform(size=nchw_shape).astype(np.float32)
    weight = np.random.uniform(size=oihw_shape).astype(np.float32)
    print(relay_conv2d(data, weight))

    # Winograd variant: NHWC activations and a (khat, khat, in, out) weight
    # filled with random values.
    tile_size = 4  # only support 4
    khat = tile_size + 2
    nhwc_shape = (1, size, size, channel)
    winograd_weight_shape = (khat, khat, channel, channel)
    data = np.random.uniform(size=nhwc_shape).astype(np.float32)
    weight = np.random.uniform(size=winograd_weight_shape).astype(np.float32)
    print(relay_winograd(data, weight, tile_size))

I noticed that Winograd is sometimes slower than the naive convolution, and most of the time the two take a similar amount of time. The same thing happens when I run the TE script (from TOPI) and tune it with Ansor. I don't know why.