I'm seeing a ~20000x slowdown when running inference through relay.build_module.create_executor('graph', mod, ctx, target) compared to relay.build_module.build(mod, target, params=params).
Here is a script that reproduces it on the MLP workload (the create_executor version first, the graph runtime version after the separator):
import time

import numpy as np
import tvm
from tvm import relay
from tvm.relay import testing

batch_size = 1
num_class = 10
image_shape = (1, 28, 28)  # MLP input shape
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)

mod, params = relay.testing.mlp.get_workload(
    batch_size=batch_size, image_shape=image_shape)

opt_level = 3
target = "llvm"
ctx = tvm.cpu()
with relay.build_config(opt_level=opt_level):
    executor = relay.build_module.create_executor('graph', mod, ctx, target)

dtype = 'float32'
data = tvm.nd.array(np.random.uniform(-1, 1, size=data_shape).astype(dtype))
tvm_out = executor.evaluate()(data, **params)
out = tvm_out.asnumpy()
print(out.flatten()[0:10])

test_times = 100
start = time.time()
for i in range(test_times):
    # evaluate() is called on every iteration here
    executor.evaluate()(data, **params)
end = time.time()
print("Mean inference time (ms):", (end - start) / test_times * 1000)
#####################################################
import time

import numpy as np
import tvm
from tvm import relay
from tvm.relay import testing
from tvm.contrib import graph_runtime

batch_size = 1
num_class = 10
image_shape = (1, 28, 28)  # MLP input shape
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)

mod, params = relay.testing.mlp.get_workload(
    batch_size=batch_size, image_shape=image_shape)
print(mod.astext(show_meta_data=False))

opt_level = 3
target = "llvm"
with relay.build_config(opt_level=opt_level):
    graph, lib, params = relay.build_module.build(
        mod, target, params=params)

ctx = tvm.cpu()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
module = graph_runtime.create(graph, lib, ctx)
module.set_input("data", data)
module.set_input(**params)
module.run()
out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()
print(out.flatten()[0:10])

test_times = 100
start = time.time()
for i in range(test_times):
    module.run()
end = time.time()
print("Mean inference time (ms):", (end - start) / test_times * 1000)
@wweic