As part of addressing the warning: DeprecationWarning: legacy graph runtime behaviour of producing json / lib / params will be removed in the next release
I’ve noticed that there seems to be a performance regression introduced as a result of using the new relay.build.
Given the following trivial example:
import os
import numpy as np
import tvm
from PIL import Image
from tvm import te
from tvm.contrib import graph_runtime
from tvm import relay
from tvm.runtime import container
from tvm.runtime import vm as vm_rt
from tvm.relay import testing
from tvm.relay import vm
from tvm.contrib.download import download_testdata
from util import load_test_image
# Path to the TFLite model used for the benchmark.
model_dir = "./mnasnet_1.3_224/"
tflite_model_file = os.path.join(model_dir, "mnasnet_1.3_224.tflite")

# Read the model bytes with a context manager so the file handle is
# closed deterministically (the original open(...).read() leaked it).
with open(tflite_model_file, "rb") as f:
    tflite_model_buf = f.read()

# Get TFLite model from buffer
try:
    import tflite
    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
except AttributeError:
    # Older tflite packages expose the class one level deeper.
    import tflite.Model
    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)

# Input description: one 224x224 RGB image, float32, NHWC layout.
dtype = "float32"
width = 224
height = 224
image_data = load_test_image(dtype, width, height)
input_tensor = "input"
input_shape = (1, 224, 224, 3)
input_dtype = "float32"

# Import the TFLite graph into Relay.
mod, params = relay.frontend.from_tflite(
    tflite_model,
    shape_dict={input_tensor: input_shape},
    dtype_dict={input_tensor: input_dtype},
)

target = "llvm -mattr=+neon"
tvm_targets = tvm.target.create(target)
cpu_target = "llvm"
target_host = cpu_target
cpudevice = tvm.runtime.cpu()
ctx = tvm.runtime.context("cpu")

# Legacy build path (emits the DeprecationWarning): relay.build returns
# the (graph_json, lib, params) triple directly.
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod, target, params=params)

module = graph_runtime.create(graph, lib, tvm.cpu())
module.set_input(input_tensor, tvm.nd.array(image_data))
module.set_input(**params)

# Time 10 single-iteration runs of the whole graph and report mean/std.
ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
print("%-20s %-19s (%s)" % ("mnasnet_1.3_224.tflite", "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
This yields ~1200ms with a 15ms std deviation on certain arm64 hardware.
If we change the relay.build call (and the surrounding code) just prior to the graph_runtime.create call to the following:
# New-style build path: tvm.transform.PassContext replaces the deprecated
# relay.build_config. Here relay.build returns a single module object that
# exposes the graph json, lib, and params via getters instead of a tuple.
# NOTE(review): unlike the first snippet (which builds with the plain
# string `target` and no target_host), this call passes the created
# `tvm_targets` plus `target_host` — confirm this difference is not itself
# the source of the timing gap.
with tvm.transform.PassContext(opt_level=3):
    graph_mod = relay.build(mod, tvm_targets, params=params,target_host=target_host)
# Unpack the factory module into the pieces graph_runtime.create expects.
lib = graph_mod.get_lib()
params = graph_mod.get_params()
graph = graph_mod.get_json()
module = graph_runtime.create(graph, lib, tvm.cpu())
the time increases to ~3200ms with a 17ms std deviation.
Is PassContext properly constructed? Is this the right way to call relay.build?
If it is, then seems like some deeper digging is in order.
Thanks.