I tried the auto-scheduler with the `metal` target, but it crashed with the following error:
> Time elapsed for measurement: 6.01 s
> ----------------------------------------------------------------------
> ------------------------------ [ Train cost model ]
> ----------------------------------------------------------------------
> Time elapsed for training: 1.03 s
> Compile...
> Traceback (most recent call last):
> File "./tune_network_local_opencl.py", line 348, in <module>
> tune_and_evaluate()
> File "./tune_network_local_opencl.py", line 323, in tune_and_evaluate
> lib = relay.build(mod, target=target, params=params)
> File "/Users/banma-1396/proj/tvm/tvm/python/tvm/relay/build_module.py", line 290, in build
> graph_json, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params)
> File "/Users/banma-1396/proj/tvm/tvm/python/tvm/relay/build_module.py", line 136, in build
> self._build(mod, target, target_host)
> File "/Users/banma-1396/proj/tvm/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 237, in __call__
> raise get_last_ffi_error()
> tvm._ffi.base.TVMError: Traceback (most recent call last):
> [bt] (8) 9 libtvm.dylib 0x0000000145969ca8 tvm::SplitDevHostFuncs(tvm::IRModule, tvm::Target const&, tvm::Target const&, tvm::transform::PassContext const&) + 2376
> [bt] (7) 8 libtvm.dylib 0x00000001459d06dc tvm::transform::Pass::operator()(tvm::IRModule) const + 316
> [bt] (6) 7 libtvm.dylib 0x00000001459d08bc tvm::transform::PassNode::operator()(tvm::IRModule) const + 156
> [bt] (5) 6 libtvm.dylib 0x00000001459d3505 tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const + 885
> [bt] (4) 5 libtvm.dylib 0x00000001459d0b18 tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const + 328
> [bt] (3) 4 libtvm.dylib 0x00000001459d1f2f tvm::transform::ModulePassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const + 815
> [bt] (2) 3 libtvm.dylib 0x0000000145c55a62 std::__1::__function::__func<void tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::IRModule, tvm::transform::PassContext)>::AssignTypedLambda<tvm::tir::transform::VerifyMemory()::$_0>(tvm::tir::transform::VerifyMemory()::$_0)::'lambda'(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*), std::__1::allocator<void tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::IRModule, tvm::transform::PassContext)>::AssignTypedLambda<tvm::tir::transform::VerifyMemory()::$_0>(tvm::tir::transform::VerifyMemory()::$_0)::'lambda'(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)>, void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&) + 1666
> [bt] (1) 2 libtvm.dylib 0x00000001456ff959 tvm::runtime::detail::LogFatal::Entry::Finalize() + 89
> [bt] (0) 1 libtvm.dylib 0x000000014682eee8 tvm::runtime::Backtrace() + 24
> Did you forget to bind?
> Variable `T_relu` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
> Variable `placeholder` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
> Variable `placeholder` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
> Variable `placeholder` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
> File "/Users/banma-1396/proj/tvm/tvm/src/tir/analysis/verify_memory.cc", line 202
> RuntimeError: Memory verification failed with the following errors:
> PrimFunc([placeholder, placeholder, placeholder, T_relu]) attrs={"global_symbol": "fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_6", "tir.noalias": (bool)1, "target": metal -keys=metal,gpu -max_num_threads=256} {
> // attr [data_pad] storage_scope = "global"
> allocate data_pad[float32 * 200704]
> // attr [input_tile] storage_scope = "global"
> allocate input_tile[float32 * 200704]
> // attr [B] storage_scope = "global"
> allocate B[float32 * 16]
> for (i1, 0, 16) {
> for (i2, 0, 16) {
> for (i3, 0, 256) {
> data_pad[(((i1*4096) + (i2*256)) + i3)] = tir.if_then_else(((((1 <= i1) && (i1 < 15)) && (1 <= i2)) && (i2 < 15)), placeholder[((((i1*3584) + (i2*256)) + i3) - 3840)], 0f)
> }
> }
> }
> for (eps, 0, 4) {
> for (nu, 0, 4) {
> for (p, 0, 49) {
> for (ci, 0, 256) {
> input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] = data_pad[(((((floordiv(p, 7)*8192) + (eps*4096)) + (floormod(p, 7)*512)) + (nu*256)) + ci)]
> }
> }
> }
> }
> for (i, 0, 4) {
> for (j, 0, 4) {
> B[((i*4) + j)] = select(((i == 3) && (j == 3)), 1f, select(((i == 3) && (j == 2)), 0f, select(((i == 3) && (j == 1)), 0f, select(((i == 3) && (j == 0)), 0f, select(((i == 2) && (j == 3)), 0f, select(((i == 2) && (j == 2)), 1f, select(((i == 2) && (j == 1)), 1f, select(((i == 2) && (j == 0)), -1f, select(((i == 1) && (j == 3)), -1f, select(((i == 1) && (j == 2)), 1f, select(((i == 1) && (j == 1)), -1f, select(((i == 1) && (j == 0)), 0f, select(((i == 0) && (j == 3)), 0f, select(((i == 0) && (j == 2)), 0f, select(((i == 0) && (j == 1)), 0f, select(((i == 0) && (j == 0)), 1f, 0f))))))))))))))))
> }
> }
> for (eps, 0, 4) {
> for (nu, 0, 4) {
> for (p, 0, 49) {
> for (ci, 0, 256) {
> data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] = 0f
> for (r_a, 0, 4) {
> for (r_b, 0, 4) {
> data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] = (data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] + ((input_tile[((((r_a*50176) + (r_b*12544)) + (p*256)) + ci)]*B[((r_a*4) + eps)])*B[((r_b*4) + nu)]))
> }
> }
> }
> }
> }
> }
> for (eps, 0, 4) {
> for (nu, 0, 4) {
> for (p, 0, 49) {
> for (co, 0, 256) {
> input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + co)] = 0f
> for (ci, 0, 256) {
> input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + co)] = (input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + co)] + (data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)]*placeholder[((((eps*262144) + (nu*65536)) + (co*256)) + ci)]))
> }
> }
> }
> }
> }
> for (i, 0, 4) {
> for (j, 0, 2) {
> B[((i*2) + j)] = select(((i == 3) && (j == 1)), 1f, select(((i == 3) && (j == 0)), 0f, select(((i == 2) && (j == 1)), 1f, select(((i == 2) && (j == 0)), 1f, select(((i == 1) && (j == 1)), -1f, select(((i == 1) && (j == 0)), 1f, select(((i == 0) && (j == 1)), 0f, select(((i == 0) && (j == 0)), 1f, 0f))))))))
> }
> }
> for (vh, 0, 2) {
> for (vw, 0, 2) {
> for (p, 0, 49) {
> for (co, 0, 256) {
> data_pad[((((vh*25088) + (vw*12544)) + (p*256)) + co)] = 0f
> for (r_a, 0, 4) {
> for (r_b, 0, 4) {
> data_pad[((((vh*25088) + (vw*12544)) + (p*256)) + co)] = (data_pad[((((vh*25088) + (vw*12544)) + (p*256)) + co)] + ((input_tile[((((r_a*50176) + (r_b*12544)) + (p*256)) + co)]*B[((r_a*2) + vh)])*B[((r_b*2) + vw)]))
> }
> }
> }
> }
> }
> }
> for (h, 0, 14) {
> for (w, 0, 14) {
> for (co, 0, 256) {
> input_tile[(((h*3584) + (w*256)) + co)] = data_pad[(((((floormod(h, 2)*25088) + (floormod(w, 2)*12544)) + (floordiv(h, 2)*1792)) + (floordiv(w, 2)*256)) + co)]
> }
> }
> }
> for (ax1, 0, 14) {
> for (ax2, 0, 14) {
> for (ax3, 0, 256) {
> data_pad[(((ax1*3584) + (ax2*256)) + ax3)] = (input_tile[(((ax1*3584) + (ax2*256)) + ax3)] + placeholder[ax3])
> }
> }
> }
> for (ax1, 0, 14) {
> for (ax2, 0, 14) {
> for (ax3, 0, 256) {
> T_relu[(((ax1*3584) + (ax2*256)) + ax3)] = max(data_pad[(((ax1*3584) + (ax2*256)) + ax3)], 0f)
> }
> }
> }
> }
Here is my script:
```python
import numpy as np
import os

import tvm
from tvm import relay, auto_scheduler
from tvm.relay import data_dep_optimization as ddo
import tvm.relay.testing
from tvm.contrib import graph_executor
import tvm.contrib.graph_runtime as runtime
from tvm.contrib.utils import tempdir
from tvm.contrib import cc, ndk


def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=False):
    """Get the symbol definition and random weight of a network"""
    # auto-scheduler prefers NHWC layout
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)
    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)

    if name.startswith("resnet-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name.startswith("resnet3d-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "mobilenet":
        mod, params = relay.testing.mobilenet.get_workload(
            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
        )
    elif name == "squeezenet_v1.1":
        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
        mod, params = relay.testing.squeezenet.get_workload(
            version="1.1",
            batch_size=batch_size,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "inception_v3":
        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == "mxnet":
        # an example for mxnet model
        from mxnet.gluon.model_zoo.vision import get_model

        assert layout == "NCHW"
        block = get_model("resnet50_v1", pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
        net = mod["main"]
        net = relay.Function(
            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
        )
        mod = tvm.IRModule.from_expr(net)
    elif name == "mlp":
        mod, params = relay.testing.mlp.get_workload(
            batch_size=batch_size, dtype=dtype, image_shape=image_shape, num_classes=1000
        )
    else:
        raise ValueError("Network not found.")

    if use_sparse:
        from tvm.topi.sparse.utils import convert_model_dense_to_sparse

        mod, params = convert_model_dense_to_sparse(mod, params, random_params=True)

    return mod, params, input_shape, output_shape


target = tvm.target.Target(target="metal", host="llvm")
use_ndk = False

#### TUNING OPTION ####
network = "resnet-18"
use_sparse = False
batch_size = 1
layout = "NHWC"
dtype = "float32"
log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)

# Extract tasks from the network
print("Get model...")
mod, params, input_shape, output_shape = get_network(
    network, batch_size, layout, dtype=dtype, use_sparse=use_sparse
)
print("Extract tasks...")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

for idx, task in enumerate(tasks):
    print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)


def tune_and_evaluate():
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=50,  # change this to 20000 to achieve the best performance
        builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
        runner=auto_scheduler.LocalRunner(
            number=10, repeat=1, min_repeat_ms=200, timeout=30, enable_cpu_cache_flush=True
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)

    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)

    # Create graph executor
    # dev = remote.cpu()
    # dev = tvm.context(str(target), 0)
    dev = tvm.metal(0)
    module = graph_executor.GraphModule(lib["default"](dev))
    # module = runtime.GraphModule(rlib["default"](dev))
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
    module.set_input("data", data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500)
    prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
    print(
        "Mean inference time (std dev): %.2f ms (%.2f ms)"
        % (np.mean(prof_res), np.std(prof_res))
    )


tune_and_evaluate()
```
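The tuning phase itself finishes; the crash only appears at the compile step. Once the tuning log exists it can be reproduced without re-running the tuner (a minimal sketch, assuming `mod`, `params`, `target`, and `log_file` are set up exactly as in the script above):

```python
# Minimal repro of just the failing step, assuming mod / params / target /
# log_file are already defined as in the full script above.
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.backend.use_auto_scheduler": True}
    ):
        # This relay.build call is where the "Memory verification failed" error is raised.
        lib = relay.build(mod, target=target, params=params)
```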
Could anyone help me?