@masahi We are doing on-the-fly quantization and saving the .so file with the graph runtime, and it works fine. We plan to do the same with the VM flow. However, in the VM flow the lib is not updated after on-the-fly quantization. Am I missing anything when saving the .so file?
# Partition the Relay module for the Vitis-AI DPU, compile it with the
# Relay VM flow, and serialize the resulting executable to disk
# (bytecode -> .ro file, kernel library -> .so file).
mod = partition_for_vitis_ai(mod, params, target)
#print(mod.astext())

# Directories/files used by the Vitis-AI codegen for this DPU target.
cwd = os.getcwd()
vai_build_dir = os.path.join(cwd, target + '_build')
vai_work_dir = os.path.join(cwd, target + '_work')
export_rt_mod_file = os.path.join(cwd, 'dpu.rtmod')

build_options = {
    'dpu': target,
    'build_dir': vai_build_dir,
    'work_dir': vai_work_dir,
    'export_runtime_module': export_rt_mod_file,
}

# Compile and serialize under a PassContext carrying the Vitis-AI
# options so the external codegen sees them.
# NOTE(review): the paste lost its indentation — I assume save/export
# were inside this `with` block as well; confirm against the original.
with tvm.transform.PassContext(
        opt_level=3,
        config={'relay.ext.vitis_ai.options': build_options}):
    executable = rly_vm.compile(mod, target=tvm_target, params=params)
    code, lib = executable.save()
    lib.export_library(model + '_' + target + '.so')
    with open(model + '_' + target + '.ro', "wb") as fo:
        fo.write(code)
############################################################
## Create TVM InferenceSession
############################################################
print("Create InferenceSession")

# Reload the serialized executable from disk: recombine the kernel
# library (.so) and the VM bytecode (.ro) into a VM Executable, then
# wrap it in a VirtualMachine targeting the CPU.
loaded_lib = tvm.runtime.load_module(model + '_' + target + '.so')
# Fix: the original `open(...).read()` never closed the file handle —
# use a context manager so it is released deterministically.
with open(model + '_' + target + '.ro', "rb") as f:
    loaded_code = bytearray(f.read())
des_exec = _vm.Executable.load_exec(loaded_code, loaded_lib)
InferenceSession = _vm.VirtualMachine(des_exec, tvm.cpu())
############################################################
## Quantization using first N inputs
##
## To accelerate inference of Neural Network models with
## Vitis-AI DPU accelerators, those models normally need to
## be quantized upfront. With on-the-fly quantization this
## preprocessing step is removed: the usual inference calls
## (InferenceSession.run) on the first N inputs are used to
## quantize and calibrate the model. From that point onwards
## inference is accelerated for all subsequent inputs.
############################################################
#
## Set the number of inputs used for quantization (e.g. 8)
## via the PX_QUANT_SIZE environment variable to quantize on
## fewer inputs. The default is 128.
px_quant_size = int(os.environ.get('PX_QUANT_SIZE', 128))

# Run the first N inferences purely for calibration; the results are
# not needed, only the side effect of quantizing the DPU runtime.
for _ in range(px_quant_size):
    InferenceSession.run(**map_inputs)

# BUG FIX: the original re-exported `lib` here — the library produced by
# the *compile* step, before quantization ever ran. The calibration ran
# through `des_exec`, which was built from `loaded_lib`, so the updated
# runtime state lives in `loaded_lib`, not `lib`; exporting `lib` just
# rewrites the same pre-quantization .so ("lib is not updating").
# Export the module that actually executed the calibration instead.
# NOTE(review): the calibrated DPU runtime is also serialized to the
# 'export_runtime_module' file (dpu.rtmod) configured at build time —
# confirm against the TVM Vitis-AI docs which artifact the deployment
# flow should load.
loaded_lib.export_library(model + '_' + target + '.so')