Hey,
I've been trying to get a minimal implementation of an MNIST digit CNN running on a PYNQ VTA instance. To learn the VTA design flow, I started from the ResNet-18 example and adapted/trained my own model for the MNIST digit dataset. Inference runs perfectly fine on an x86 CPU, and with slightly lower accuracy via tvm/RPC on the ARM cores of the PYNQ. However, as soon as I use VTA, all predictions get stuck at 8 or 3 and nothing is classified correctly anymore.
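For context, my board/RPC session is set up along the lines of the deploy_classification tutorial (a sketch; host/port are placeholders for my board):

```python
# Sketch of my session setup, following the deploy_classification tutorial
# (host/port are placeholders; "device" switches between ARM-CPU-only and VTA runs).
import os
import tvm
import vta
from tvm import rpc

env = vta.get_env()
device = "vta"  # or "arm_cpu" for the pure ARM-core run
target = env.target if device == "vta" else env.target_vta_cpu

host = os.environ.get("VTA_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_RPC_PORT", "9091"))
remote = rpc.connect(host, port)
vta.reconfig_runtime(remote)              # reset the VTA runtime on the board
vta.program_fpga(remote, bitstream=None)  # program the FPGA with the default bitstream
ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
```

The build function is where things go wrong: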
```python
def buildExecutionGraphVTA(env, remote, ctx, target, args):
    print("Building Execution Graph")
    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):
        # Populate the shape and data type dictionary for the MNIST classifier input
        dtype_dict = {"data": "float32"}
        shape_dict = {"data": (env.BATCH, 28, 28, 1)}

        # Load my own trained Gluon model
        gluon_model = vision.get_model(model, pretrained=False, classes=10)
        f_path = "/home/xxx/yyy/models/"
        model_f = "gluon_ownModelTrained_1_1_28_28"
        name = "model"
        f_name = f_path + model_f + "/" + name
        gluon_model.load_parameters(f_name)
        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)
        if target.device_name == "vta":
            # Perform quantization in Relay
            # Note: we set opt_level to 3 in order to fold batch norm
            with tvm.transform.PassContext(opt_level=3):
                with relay.quantize.qconfig(
                    calibrate_mode="global_scale", global_scale=8.0, skip_conv_layers=[]
                ):
                    mod = relay.quantize.quantize(mod, params=params)
            # Perform graph packing and constant folding for VTA target
            assert env.BLOCK_IN == env.BLOCK_OUT
            print(mod["main"])
            relay_prog = graph_pack(
                mod["main"],
                env.BATCH,
                env.BLOCK_OUT,
                env.WGT_WIDTH,
                start_name="nn.conv2d",  # instead of pack_dict[model][0]; see note below
                stop_name=pack_dict[model][1],
            )
        else:
            relay_prog = mod["main"]
        # Compile Relay program with AlterOpLayout disabled
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib2, params = relay.build(
                    relay_prog, target=tvm.target.Target(target, host="llvm"), params=params
                )
        else:
            print(env.TARGET, env.target_host)
            if env.TARGET == "intelfocl":
                # multiple targets to run both on cpu and vta
                target = {"cpu": env.target_vta_cpu, "ext_dev": target}
            onlyCPU = args.cpu and args.vta
            print("VTA ONLY CPU: ", onlyCPU)
            print(type(target), target)
            if onlyCPU:
                target = {"cpu": env.target_vta_cpu}
            with vta.build_config(opt_level=1, disabled_pass={"AlterOpLayout"}):
                graph, lib2, params = relay.build(
                    relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
                )
        # Measure Relay build time
        build_time = time.time() - build_start
        print(model + " inference graph built in {0:.2f}s!".format(build_time))

        # Send the inference library over to the remote RPC server
        temp = utils.tempdir()
        lib2.export_library(temp.relpath("graphlib.tar"))
        remote.upload(temp.relpath("graphlib.tar"))
        lib = remote.load_module("graphlib.tar")

        if env.TARGET == "intelfocl":
            ctxes = [remote.ext_dev(0), remote.cpu(0)]
            m = graph_executor.create(graph, lib, ctxes)
        else:
            # Graph executor
            m = graph_executor.create(graph, lib, ctx)
        return m, params, graph, lib2
```
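For reference, `pack_dict` and `model` still come from the ResNet-18 tutorial; if I remember the entry correctly, it is:

```python
# pack_dict entry from the deploy_classification tutorial: packing normally starts
# at the first max-pool (i.e. after the first conv) and stops before the classifier.
pack_dict = {
    "resnet18_v1": ["nn.max_pool2d", "nn.global_avg_pool2d"],
}
model = "resnet18_v1"
```

So the only thing I changed there is `start_name`, because of the constraint mentioned below. The inference side looks like this: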
```python
def runImageInferenceVTA(env, remote, ctx, m, params, test_loader, folder_path):
    print("Run Image Inference VTA")
    # Load images for evaluation
    for batch_idx, (features, targets) in enumerate(test_loader):
        for idx, (image, label) in enumerate(zip(features, targets)):
            image = image.numpy()
            image = np.expand_dims(image, axis=3)  # shape: (1, 28, 28, 1)
            image = np.tile(image, (1, 1, 1, 1))  # ensure the expected batch size

            # Set the network parameters and inputs
            m.set_input(**params)
            m.set_input("data", image)

            # Perform inference
            m.run()

            # Get classification results
            tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 10), "float32", remote.cpu(0)))
            for b in range(env.BATCH):
                pred = tvm_output.asnumpy().argmax(axis=1).astype("int").tolist()
                print("Prediction (PRED, LABEL, EQUAL): ", pred[b], label.item(), pred[b] == label.item())
```
I narrowed the error down to the actual execution of `graph_pack`: after running it, the predictions are off.
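Roughly, this is how I isolated it (a sketch, simplified from my script; `mod`, `params`, and `image` are the quantized module, parameters, and one (1, 28, 28, 1) float32 input from the code above):

```python
# Sketch: run the quantized but *unpacked* module on the local CPU and compare
# its predictions against the VTA run above.
import numpy as np

def cpu_predict(quant_mod, image):
    with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
        graph, lib, p = relay.build(quant_mod, target="llvm", params=params)
    m = graph_executor.create(graph, lib, tvm.cpu(0))
    m.set_input(**p)
    m.set_input("data", image)
    m.run()
    return m.get_output(0).asnumpy()

# Quantized-but-unpacked predictions still match the labels; only after
# graph_pack (and running on VTA) do they collapse to 8 / 3.
print(cpu_predict(mod, image).argmax(axis=1))
```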
The only constraint I have is that the first convolution also needs to run on VTA, which is why I set `start_name="nn.conv2d"` instead of the tutorial's `"nn.max_pool2d"`.
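In case it is relevant: `graph_pack` also accepts `start_name_idx`/`stop_name_idx` (the deploy_detection tutorial uses them), so the packing boundary could presumably be pinned to the first conv explicitly. A sketch of what I mean, under the assumption that the index counts operators in the `print(mod["main"])` output:

```python
# Sketch: pin the packing region by operator index as well as by name.
# Assumption: start_name_idx is the position of the first nn.conv2d, as read
# off the print(mod["main"]) dump above.
relay_prog = graph_pack(
    mod["main"],
    env.BATCH,
    env.BLOCK_OUT,
    env.WGT_WIDTH,
    start_name="nn.conv2d",
    start_name_idx=0,  # hypothetical index of the first conv2d
    stop_name=pack_dict[model][1],
)
```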
I'd be glad if someone could point me to either a minimal example that doesn't require ResNet-18, or give me an idea of where I am going wrong with graph_pack or my overall implementation.
Thanks for any help.
Kind regards