Relay Graph executor performance regression

I'm seeing a ~20,000X performance regression between relay.build_module.build(mod, target, params=params) and relay.build_module.create_executor('graph', mod, ctx, target).
Here are two scripts that reproduce it on the MLP workload:

import time

import numpy as np

import tvm
from tvm import relay
from tvm.relay import testing

batch_size = 1
num_class = 10
image_shape = (1, 28, 28)  # for mlp
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)

mod, params = relay.testing.mlp.get_workload(
    batch_size=batch_size, image_shape=image_shape)

opt_level = 3
target = "llvm"
ctx = tvm.cpu()
with relay.build_config(opt_level=opt_level):
    executor = relay.build_module.create_executor('graph', mod, ctx, target)

dtype = 'float32'
data = tvm.nd.array(np.random.uniform(-1, 1, size=data_shape).astype(dtype))
tvm_out = executor.evaluate()(data, **params)
out = tvm_out.asnumpy()
print(out.flatten()[0:10])
test_times = 100

start = time.time()
for i in range(test_times):
    executor.evaluate()(data, **params)  # note: evaluate() is invoked on every iteration
end = time.time()
print("Mean inference time", (end - start) / test_times * 1000)

#####################################################

import time

import numpy as np

import tvm
from tvm import relay
from tvm.relay import testing
from tvm.contrib import graph_runtime

batch_size = 1
num_class = 10

image_shape = (1, 28, 28)  # for mlp
data_shape = (batch_size,) + image_shape
out_shape = (batch_size, num_class)

mod, params = relay.testing.mlp.get_workload(
    batch_size=batch_size, image_shape=image_shape)

print(mod.astext(show_meta_data=False))

opt_level = 3
target = "llvm"
with relay.build_config(opt_level=opt_level):
    graph, lib, params = relay.build_module.build(
        mod, target, params=params)

ctx = tvm.cpu()
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
module = graph_runtime.create(graph, lib, ctx)
module.set_input("data", data)
module.set_input(**params)
module.run()
out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy()

print(out.flatten()[0:10])


test_times = 100

start = time.time()
for i in range(test_times):
    module.run()
end = time.time()
print("Mean inference time", (end - start) / test_times * 1000)

@wweic:

GraphExecutor is for debugging and testing purposes. Most of the time in your first script is spent in the setup work that executor.evaluate() performs, not in inference itself.

relay.build_module.build is preferable for performance measurement.
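If you do want to time the create_executor path, a minimal sketch (against the first script above) is to call evaluate() once outside the loop, so the build cost is paid only once:

run = executor.evaluate()  # the function is built once, here

start = time.time()
for i in range(test_times):
    run(data, **params)  # only inference is inside the timed loop
end = time.time()
print("Mean inference time (ms):", (end - start) / test_times * 1000)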

Thanks for your reply.
We know that with relay.build_module.build, we set the parameters and input data before module.run in the graph runtime, as follows:
module = graph_runtime.create(graph, lib, ctx)
module.set_input("data", data)
module.set_input(**params)

Can we do the same with relay.build_module.create_executor('graph', mod, ctx, target)?
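For example, would something along these lines work? This is just a sketch of what I have in mind; it assumes bind_params_by_name is available in tvm.relay.build_module in this version:

from tvm.relay.build_module import bind_params_by_name

# Fold the weights into the module once, so evaluate() no longer
# needs **params on every call.
mod["main"] = bind_params_by_name(mod["main"], params)

with relay.build_config(opt_level=opt_level):
    executor = relay.build_module.create_executor('graph', mod, ctx, target)

run = executor.evaluate()
out = run(data)  # only the input is passed per call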