relay.build_config(opt_level=3) gives an error, but with opt_level=1 it works fine

I am converting an ONNX model to TVM and have tried several values of opt_level. When I set it as follows:

with relay.build_config(opt_level=1):

it works well (I have tested it with android_deploy), but the latency is high.
So I set opt_level=3, and then I get the following error:

    Traceback (most recent call last):
      File "cpd_mobilenetv3_deploy_on_android.py", line 63, in <module>
        graph, lib, params = relay.build(func, target=target, params=params)
      File "/workspace/tvm_new/tvm/python/tvm/relay/build_module.py", line 356, in build
        params)
      File "/workspace/tvm_new/tvm/python/tvm/relay/build_module.py", line 183, in build
        self._build(func, target, target_host)
      File "/workspace/tvm_new/tvm/python/tvm/_ffi/_ctypes/function.py", line 209, in __call__
        raise get_last_ffi_error()
    tvm._ffi.base.TVMError: Traceback (most recent call last):
    [bt] (8) /workspace/tvm_new/tvm/build/libtvm.so(+0x62968d) [0x7f88fce0068d]
    [bt] (7) /workspace/tvm_new/tvm/build/libtvm.so(+0x6ff3db) [0x7f88fced63db]
    [bt] (6) /workspace/tvm_new/tvm/build/libtvm.so(tvm::relay::InferType(tvm::relay::Expr const&, tvm::relay::Module const&)+0x3fd) [0x7f88fced613d]
    .........
    .........
    %298 = multiply(%37, %297)
    Incompatible broadcast type TensorType([1, 3, 44, 44, 8], float32) and TensorType([1, 0, 44, 44, 8], float32);

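To narrow this down, one thing I can try is disabling the extra opt_level=3 passes one at a time and checking which one produces the bad shape. A minimal sketch, reusing func, target and params from the script below, and assuming that relay.build_config in my TVM version accepts a disabled_pass list and that these are really the names of the level-3 passes:

    # Sketch: rebuild with one opt_level=3 pass disabled at a time to find the pass
    # that introduces the [1, 0, 44, 44, 8] broadcast shape (pass names are guesses).
    for disabled in (["AlterOpLayout"], ["FoldScaleAxis"], ["CombineParallelConv2D"]):
        try:
            with relay.build_config(opt_level=3, disabled_pass=disabled):
                graph, lib, built_params = relay.build(func, target=target, params=params)
            print('build succeeded with {} disabled'.format(disabled))
        except tvm._ffi.base.TVMError:
            print('build still fails with {} disabled'.format(disabled))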
My code is as follows:

import nnvm
import os
import numpy as np
from PIL import Image
import tvm
import tvm.relay as relay
from tvm import rpc
from tvm.contrib import util, ndk, graph_runtime
import onnx
import logging
#logging.getLogger().setLevel(logging.DEBUG)

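# Resize to 352x352, normalize each channel with the mean/std values below, and
# rearrange NHWC -> NCHW so the output matches input_shape = (1, 3, 352, 352).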
def preprocess_image(image_file):
    resized_image = Image.open(image_file).convert('RGB').resize((352, 352))
    image_data = np.asarray(resized_image)[np.newaxis, :, :, :].astype(np.float32)
    x_1 = image_data
    x_2 = np.concatenate(((x_1[:, :, :, 0] - 124.16) / 58.624, (x_1[:, :, :, 1] - 116.736) / 57.344), axis=0)
    x_1 = np.concatenate((x_2, (x_1[:, :, :, 2] - 103.936) / 57.6), axis=0)
    x = np.expand_dims(x_1, axis=0)
    return x

if __name__ == "__main__":

    image_file = 'cat.png'
    image_data = preprocess_image(image_file)

    model_file = './models/mobilenetv3_small_cpd.onnx'
    onnx_model = onnx.load(model_file)
    from tvm import relay

    local_demo = False
    test_target = 'cpu'
    target = 'llvm -target=arm64-linux-android -mattr=+neon'
    target_host = None

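    # Default: build for 64-bit ARM Android with NEON; the branches below switch
    # to a plain local llvm build or to OpenCL/Vulkan on the device for testing.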
    if local_demo:
        target_host = None
        target = 'llvm'
    elif test_target == 'opencl':
        target_host = target
        target = 'opencl'
    elif test_target == 'vulkan':
        target_host = target
        target = 'vulkan'

    input_name = '0'
    input_shape = (1, 3, 352, 352)
    shape_dict = {input_name: input_shape}
    func, params = relay.frontend.from_onnx(onnx_model, shape_dict)

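    # This is the line in question: with opt_level=1 it builds and runs on the phone,
    # with opt_level=3 it raises the Incompatible broadcast type error above.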
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(func, target=target, params=params)

    tmp = util.tempdir()
    lib_fname = tmp.relpath('net.so')
    fcompile = ndk.create_shared if not local_demo else None
    lib.export_library(lib_fname, fcompile)

    # also save the compiled model to local files
    if True:
        libpath = 'cpd.so'
        lib.export_library(libpath, fcompile)

        graph_json_path = 'cpd.json'
        with open(graph_json_path, 'w') as fo:
            fo.write(graph)

        param_path = 'cpd.params'
        with open(param_path, 'wb') as fo:
            fo.write(relay.save_param_dict(params))

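    # Connect to the RPC tracker and request the device registered under key 'android'.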
    tracker_host = os.environ.get('TVM_TRACKER_HOST', '0.0.0.0')
    tracker_port = 9195
    key = 'android'

    if local_demo:
        remote = rpc.LocalSession()
    else:
        tracker = rpc.connect_tracker(tracker_host, tracker_port)
        # When running a heavy model, we should increase the `session_timeout`
        remote = tracker.request(key, priority=0,
                                session_timeout=60)

    if local_demo:
        ctx = remote.cpu(0)
    elif test_target == 'opencl':
        ctx = remote.cl(0)
    elif test_target == 'vulkan':
        ctx = remote.vulkan(0)
    else:
        ctx = remote.cpu(0)

    # upload the library to remote device and load it
    remote.upload(lib_fname)
    rlib = remote.load_module('net.so')

    module = graph_runtime.create(graph, rlib, ctx)

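    # load the trained parameters into the module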
    module.set_input(**params)
    # set input data
    module.set_input(input_name, tvm.nd.array(image_data))
    # run
    module.run()
    # get output
    out = module.get_output(0)

    # get top1 result
    top1 = np.argmax(out.asnumpy())
    print('TVM prediction top-1: {}'.format(top1))

    print('Evaluate inference time cost...')
    ftimer = module.module.time_evaluator('run', ctx, number=1, repeat=10)
    prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
    print('Mean inference time (std dev): %.2f ms (%.2f ms)' % (np.mean(prof_res),
                                                                np.std(prof_res)))
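
For completeness: the RPC tracker is started beforehand (something like `python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9195`) and the phone is registered with the key `android`. The whole flow above runs end to end with opt_level=1; only the build step fails once I switch to opt_level=3.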