I'm seeing a ~50x performance regression between `relay.build_module.build(mod, target, params=params)` and `relay.build_module.create_executor('vm', mod, ctx, target)`.
Here is a script to reproduce the issue with an MLP model:
import numpy as np
import time

from tvm import relay
from tvm.relay import testing
import tvm

# Benchmark the 'graph' executor on the MLP test workload.
batch_size = 1
num_class = 10
image_shape = (1, 28, 28)  # input shape expected by relay.testing.mlp
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)

mod, params = relay.testing.mlp.get_workload(
    batch_size=batch_size, image_shape=image_shape)

opt_level = 3
target = "llvm"
ctx = tvm.cpu()
with relay.build_config(opt_level=opt_level):
    executor = relay.build_module.create_executor('graph', mod, ctx, target)

dtype = 'float32'
data = tvm.nd.array(np.random.uniform(-1, 1, size=data_shape).astype(dtype))

# Build the compiled function ONCE. Calling executor.evaluate() inside the
# timing loop re-creates (and re-compiles) the runtime function on every
# iteration, so the loop would measure setup overhead instead of inference.
run = executor.evaluate()

# Warm-up call; also sanity-check the output.
tvm_out = run(data, **params)
out = tvm_out.asnumpy()
print(out.flatten()[0:10])

test_times = 100
start = time.time()
for _ in range(test_times):
    run(data, **params)
end = time.time()
print("Mean inference time", (end - start) / test_times * 1000)
##########################################################
import numpy as np
import time

from tvm import relay
from tvm.relay import testing
import tvm

# Benchmark the 'vm' executor on the same MLP workload for comparison.
batch_size = 1
num_class = 10
image_shape = (1, 28, 28)  # input shape expected by relay.testing.mlp
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)

mod, params = relay.testing.mlp.get_workload(
    batch_size=batch_size, image_shape=image_shape)
print(mod.astext(show_meta_data=False))

opt_level = 3
target = "llvm"
ctx = tvm.cpu()
with relay.build_config(opt_level=opt_level):
    executor = relay.build_module.create_executor('vm', mod, ctx, target)

dtype = "float32"
data = tvm.nd.array(np.random.uniform(-1, 1, size=data_shape).astype(dtype))

# Build the compiled function ONCE. For the VM executor, executor.evaluate()
# compiles the module into VM bytecode; doing that inside the timing loop
# measures compilation on every iteration and inflates the reported time.
run = executor.evaluate()

# Warm-up call; also sanity-check the output.
tvm_out = run(data, **params)
out = tvm_out.asnumpy()

test_times = 100
start = time.time()
for _ in range(test_times):
    run(data, **params)
end = time.time()
print("Mean inference time", (end - start) / test_times * 1000)
cc @wweic