Auto-scheduler crash on Metal device

I tried the auto-scheduler with a Metal target, but compilation crashed with the following error:

> Time elapsed for measurement: 6.01 s
> ----------------------------------------------------------------------
> ------------------------------  [ Train cost model ]
> ----------------------------------------------------------------------
> Time elapsed for training: 1.03 s
> Compile...
> Traceback (most recent call last):
>   File "./tune_network_local_opencl.py", line 348, in <module>
>     tune_and_evaluate()
>   File "./tune_network_local_opencl.py", line 323, in tune_and_evaluate
>     lib = relay.build(mod, target=target, params=params)
>   File "/Users/banma-1396/proj/tvm/tvm/python/tvm/relay/build_module.py", line 290, in build
>     graph_json, runtime_mod, params = bld_mod.build(mod=ir_mod, target=target, params=params)
>   File "/Users/banma-1396/proj/tvm/tvm/python/tvm/relay/build_module.py", line 136, in build
>     self._build(mod, target, target_host)
>   File "/Users/banma-1396/proj/tvm/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 237, in __call__
>     raise get_last_ffi_error()
> tvm._ffi.base.TVMError: Traceback (most recent call last):
>   [bt] (8) 9   libtvm.dylib                        0x0000000145969ca8 tvm::SplitDevHostFuncs(tvm::IRModule, tvm::Target const&, tvm::Target const&, tvm::transform::PassContext const&) + 2376
>   [bt] (7) 8   libtvm.dylib                        0x00000001459d06dc tvm::transform::Pass::operator()(tvm::IRModule) const + 316
>   [bt] (6) 7   libtvm.dylib                        0x00000001459d08bc tvm::transform::PassNode::operator()(tvm::IRModule) const + 156
>   [bt] (5) 6   libtvm.dylib                        0x00000001459d3505 tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const + 885
>   [bt] (4) 5   libtvm.dylib                        0x00000001459d0b18 tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const + 328
>   [bt] (3) 4   libtvm.dylib                        0x00000001459d1f2f tvm::transform::ModulePassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const + 815
>   [bt] (2) 3   libtvm.dylib                        0x0000000145c55a62 std::__1::__function::__func<void tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::IRModule, tvm::transform::PassContext)>::AssignTypedLambda<tvm::tir::transform::VerifyMemory()::$_0>(tvm::tir::transform::VerifyMemory()::$_0)::'lambda'(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*), std::__1::allocator<void tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::IRModule, tvm::transform::PassContext)>::AssignTypedLambda<tvm::tir::transform::VerifyMemory()::$_0>(tvm::tir::transform::VerifyMemory()::$_0)::'lambda'(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)>, void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&) + 1666
>   [bt] (1) 2   libtvm.dylib                        0x00000001456ff959 tvm::runtime::detail::LogFatal::Entry::Finalize() + 89
>   [bt] (0) 1   libtvm.dylib                        0x000000014682eee8 tvm::runtime::Backtrace() + 24
>   Did you forget to bind?
>     Variable `T_relu` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
>     Variable `placeholder` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
>     Variable `placeholder` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
>     Variable `placeholder` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
>   File "/Users/banma-1396/proj/tvm/tvm/src/tir/analysis/verify_memory.cc", line 202
> RuntimeError: Memory verification failed with the following errors:
> PrimFunc([placeholder, placeholder, placeholder, T_relu]) attrs={"global_symbol": "fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_6", "tir.noalias": (bool)1, "target": metal -keys=metal,gpu -max_num_threads=256} {
>   // attr [data_pad] storage_scope = "global"
>   allocate data_pad[float32 * 200704]
>   // attr [input_tile] storage_scope = "global"
>   allocate input_tile[float32 * 200704]
>   // attr [B] storage_scope = "global"
>   allocate B[float32 * 16]
>   for (i1, 0, 16) {
>     for (i2, 0, 16) {
>       for (i3, 0, 256) {
>         data_pad[(((i1*4096) + (i2*256)) + i3)] = tir.if_then_else(((((1 <= i1) && (i1 < 15)) && (1 <= i2)) && (i2 < 15)), placeholder[((((i1*3584) + (i2*256)) + i3) - 3840)], 0f)
>       }
>     }
>   }
>   for (eps, 0, 4) {
>     for (nu, 0, 4) {
>       for (p, 0, 49) {
>         for (ci, 0, 256) {
>           input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] = data_pad[(((((floordiv(p, 7)*8192) + (eps*4096)) + (floormod(p, 7)*512)) + (nu*256)) + ci)]
>         }
>       }
>     }
>   }
>   for (i, 0, 4) {
>     for (j, 0, 4) {
>       B[((i*4) + j)] = select(((i == 3) && (j == 3)), 1f, select(((i == 3) && (j == 2)), 0f, select(((i == 3) && (j == 1)), 0f, select(((i == 3) && (j == 0)), 0f, select(((i == 2) && (j == 3)), 0f, select(((i == 2) && (j == 2)), 1f, select(((i == 2) && (j == 1)), 1f, select(((i == 2) && (j == 0)), -1f, select(((i == 1) && (j == 3)), -1f, select(((i == 1) && (j == 2)), 1f, select(((i == 1) && (j == 1)), -1f, select(((i == 1) && (j == 0)), 0f, select(((i == 0) && (j == 3)), 0f, select(((i == 0) && (j == 2)), 0f, select(((i == 0) && (j == 1)), 0f, select(((i == 0) && (j == 0)), 1f, 0f))))))))))))))))
>     }
>   }
>   for (eps, 0, 4) {
>     for (nu, 0, 4) {
>       for (p, 0, 49) {
>         for (ci, 0, 256) {
>           data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] = 0f
>           for (r_a, 0, 4) {
>             for (r_b, 0, 4) {
>               data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] = (data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] + ((input_tile[((((r_a*50176) + (r_b*12544)) + (p*256)) + ci)]*B[((r_a*4) + eps)])*B[((r_b*4) + nu)]))
>             }
>           }
>         }
>       }
>     }
>   }
>   for (eps, 0, 4) {
>     for (nu, 0, 4) {
>       for (p, 0, 49) {
>         for (co, 0, 256) {
>           input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + co)] = 0f
>           for (ci, 0, 256) {
>             input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + co)] = (input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + co)] + (data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)]*placeholder[((((eps*262144) + (nu*65536)) + (co*256)) + ci)]))
>           }
>         }
>       }
>     }
>   }
>   for (i, 0, 4) {
>     for (j, 0, 2) {
>       B[((i*2) + j)] = select(((i == 3) && (j == 1)), 1f, select(((i == 3) && (j == 0)), 0f, select(((i == 2) && (j == 1)), 1f, select(((i == 2) && (j == 0)), 1f, select(((i == 1) && (j == 1)), -1f, select(((i == 1) && (j == 0)), 1f, select(((i == 0) && (j == 1)), 0f, select(((i == 0) && (j == 0)), 1f, 0f))))))))
>     }
>   }
>   for (vh, 0, 2) {
>     for (vw, 0, 2) {
>       for (p, 0, 49) {
>         for (co, 0, 256) {
>           data_pad[((((vh*25088) + (vw*12544)) + (p*256)) + co)] = 0f
>           for (r_a, 0, 4) {
>             for (r_b, 0, 4) {
>               data_pad[((((vh*25088) + (vw*12544)) + (p*256)) + co)] = (data_pad[((((vh*25088) + (vw*12544)) + (p*256)) + co)] + ((input_tile[((((r_a*50176) + (r_b*12544)) + (p*256)) + co)]*B[((r_a*2) + vh)])*B[((r_b*2) + vw)]))
>             }
>           }
>         }
>       }
>     }
>   }
>   for (h, 0, 14) {
>     for (w, 0, 14) {
>       for (co, 0, 256) {
>         input_tile[(((h*3584) + (w*256)) + co)] = data_pad[(((((floormod(h, 2)*25088) + (floormod(w, 2)*12544)) + (floordiv(h, 2)*1792)) + (floordiv(w, 2)*256)) + co)]
>       }
>     }
>   }
>   for (ax1, 0, 14) {
>     for (ax2, 0, 14) {
>       for (ax3, 0, 256) {
>         data_pad[(((ax1*3584) + (ax2*256)) + ax3)] = (input_tile[(((ax1*3584) + (ax2*256)) + ax3)] + placeholder[ax3])
>       }
>     }
>   }
>   for (ax1, 0, 14) {
>     for (ax2, 0, 14) {
>       for (ax3, 0, 256) {
>         T_relu[(((ax1*3584) + (ax2*256)) + ax3)] = max(data_pad[(((ax1*3584) + (ax2*256)) + ax3)], 0f)
>       }
>     }
>   }
> }

From the dumped PrimFunc it looks like the Winograd conv2d kernel is lowered as plain serial loops with global allocations, so nothing is bound to GPU threads. Here is my script:

```python
import numpy as np
import os

import tvm
from tvm import relay, auto_scheduler
from tvm.relay import data_dep_optimization as ddo
import tvm.relay.testing
from tvm.contrib import graph_executor
import tvm.contrib.graph_runtime as runtime
from tvm.contrib.utils import tempdir
from tvm.contrib import cc, ndk


def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=False):
    """Get the symbol definition and random weight of a network"""

    # auto-scheduler prefers NHWC layout
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)

    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)

    if name.startswith("resnet-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name.startswith("resnet3d-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "mobilenet":
        mod, params = relay.testing.mobilenet.get_workload(
            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
        )
    elif name == "squeezenet_v1.1":
        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
        mod, params = relay.testing.squeezenet.get_workload(
            version="1.1",
            batch_size=batch_size,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "inception_v3":
        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == "mxnet":
        # an example for mxnet model
        from mxnet.gluon.model_zoo.vision import get_model

        assert layout == "NCHW"

        block = get_model("resnet50_v1", pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
        net = mod["main"]
        net = relay.Function(
            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
        )
        mod = tvm.IRModule.from_expr(net)
    elif name == "mlp":
        mod, params = relay.testing.mlp.get_workload(
            batch_size=batch_size, dtype=dtype, image_shape=image_shape, num_classes=1000
        )
    else:
        raise ValueError("Network not found.")

    if use_sparse:
        from tvm.topi.sparse.utils import convert_model_dense_to_sparse

        mod, params = convert_model_dense_to_sparse(mod, params, random_params=True)

    return mod, params, input_shape, output_shape


target = tvm.target.Target(target="metal", host="llvm")

use_ndk = False

#### TUNING OPTION ####
network = "resnet-18"
use_sparse = False
batch_size = 1
layout = "NHWC"
dtype = "float32"
log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)


# Extract tasks from the network
print("Get model...")
mod, params, input_shape, output_shape = get_network(
    network, batch_size, layout, dtype=dtype, use_sparse=use_sparse
)
print("Extract tasks...")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

for idx, task in enumerate(tasks):
    print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)


def tune_and_evaluate():
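    # Tune all extracted tasks, then compile the whole network with the best found
    # schedules and benchmark it on the Metal device.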
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=50,  # change this to 20000 to achieve the best performance
        builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
        runner=auto_scheduler.LocalRunner(
            number=10, repeat=1, min_repeat_ms=200, timeout=30, enable_cpu_cache_flush=True
        ),
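        # Record every measurement to log_file so the best schedules can be replayed later.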
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)

    # Compile with the history best
    print("Compile...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)


    # Create graph executor
    dev = tvm.metal(0)
    module = graph_executor.GraphModule(lib["default"](dev))
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
    module.set_input("data", data_tvm)

    # Evaluate
    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", dev, repeat=3, min_repeat_ms=500)
    prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
    print(
        "Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))
    )


tune_and_evaluate()
```

Could anyone help me?