Metaschedule with hexagon stuck on N/A latency

Hi,

I’m having trouble tuning with MetaSchedule on a Hexagon DSP. The task table only ever shows “N/A” for speed (GFLOPS) and latency:


2023-11-29 19:33:33 [INFO] Logging directory: /tmp/tmp42v6p96r/logs
2023-11-29 19:33:40 [INFO] [task_scheduler.cc:160] Initializing Task #0: fused_nn_conv2d_add
2023-11-29 19:33:41 [INFO] [task_scheduler.cc:321]
 ID |                Name |      FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done
-----------------------------------------------------------------------------------------------------------------------
  0 | fused_nn_conv2d_add | 231411712 |      1 |            N/A |          N/A |                   N/A |      0 |
-----------------------------------------------------------------------------------------------------------------------
Total trials: 0
Total latency (us): 0

As you can see, speed and latency are stuck on “N/A”. I’ve verified that my setup with the actual hardware is correct, as I have no trouble running TEs on the device without tuning.

I’ve used the following Python code; apologies for its length.

import os
import pathlib
import sys
import numpy as np
import logging
import onnx
from PIL import Image
import tempfile

from tvm import meta_schedule as ms
from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner

import tvm.testing
from tvm.tir.transform import StorageRewrite
from tvm import te, transform, auto_scheduler

from tvm.relay.backend import Executor, Runtime
import tvm.relay as relay
from tvm.contrib import utils, ndk
from tvm.contrib import graph_executor

from tvm.contrib.hexagon.build import HexagonLauncher
from tvm.contrib.download import download_testdata
from tvm.relay import data_dep_optimization as ddo

RPC_SERVER_PORT = 7070

def get_hexagon_target(cpu_ver: str, **kwargs) -> tvm.target.Target:
    """Build a Hexagon target that is also used as its own host target.

    Parameters
    ----------
    cpu_ver : str
        Hexagon CPU version string forwarded to ``tvm.target.hexagon``
        (this script passes ``"v68"``).
    **kwargs
        Extra keyword arguments forwarded unchanged to ``tvm.target.hexagon``.

    Returns
    -------
    tvm.target.Target
        A target wrapping the Hexagon target, with ``host`` set to that same
        Hexagon target.
    """
    hexagon = tvm.target.hexagon(cpu_ver, **kwargs)
    # Device and host are deliberately the same Hexagon target object.
    return tvm.target.Target(hexagon, host=hexagon)

def build_launcher(android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket):
    """Create a ``HexagonLauncher`` for the given device and RPC tracker.

    Parameters
    ----------
    android_serial_number
        Passed to ``HexagonLauncher`` as ``serial_number``.
    tvm_tracker_host
        Stored under ``"rpc_tracker_host"`` in the launcher's RPC info.
    tvm_tracker_port
        Stored under ``"rpc_tracker_port"`` in the launcher's RPC info.
    adb_server_socket
        Stored under ``"adb_server_socket"`` in the launcher's RPC info.

    Returns
    -------
    The launcher object returned by ``HexagonLauncher``. The RPC server port
    is always the module-level ``RPC_SERVER_PORT``.
    """
    return HexagonLauncher(
        serial_number=android_serial_number,
        rpc_info=dict(
            rpc_tracker_host=tvm_tracker_host,
            rpc_tracker_port=tvm_tracker_port,
            rpc_server_port=RPC_SERVER_PORT,
            adb_server_socket=adb_server_socket,
        ),
    )


def convert_conv2d_layout(mod, desired_layouts):
    """Run Relay's ``ConvertLayout`` pass over ``mod`` at ``opt_level=3``.

    Parameters
    ----------
    mod
        The module to transform.
    desired_layouts
        Layout specification forwarded unchanged to
        ``relay.transform.ConvertLayout``.

    Returns
    -------
    The module produced by applying the single-pass sequence to ``mod``.
    """
    with tvm.transform.PassContext(opt_level=3):
        layout_pass = relay.transform.ConvertLayout(desired_layouts)
        pipeline = tvm.transform.Sequential([layout_pass])
        converted = pipeline(mod)
    return converted


def test_conv2d(hexagon_launcher):
    """Tune, compile and run an fp16 conv2d + bias-add on Hexagon via MetaSchedule.

    The function builds a small Relay module (NHWC conv2d followed by a
    broadcast add), computes a reference result on the local CPU with the
    ``llvm`` target, tunes the module for Hexagon with
    ``ms.relay_integration.tune_relay``, compiles it with the resulting
    database, runs it in a session created from ``hexagon_launcher`` and
    compares the device output against the reference.

    Parameters
    ----------
    hexagon_launcher
        Launcher object used both to build the MetaSchedule RPC runner and to
        create the session in which the compiled module is executed.

    Raises
    ------
    AssertionError
        If the mean absolute difference between the reference and the device
        output is not below 0.5.
    """

    # Problem size: 64 input channels, 64 output channels, 56x56 image, 3x3 kernel.
    i_size, o_size, h_size, w_size = 64, 64, 56, 56
    k_height_size = k_width_size = 3

    strides = (1, 1)
    padding = (1, 1)

    # Shapes follow the layouts given to conv2d below: NHWC data, HWIO weights.
    d_shape = (1, h_size, w_size, i_size)
    w_shape = (k_height_size, k_width_size, i_size, o_size)
    bias_shape = (1, 1, 1, w_shape[3])
    out_channel = w_shape[3]

    data = relay.var("data", shape=d_shape, dtype="float16")
    weight = relay.var("weight", shape=w_shape, dtype="float16")
    bias = relay.var("bias", shape=bias_shape, dtype="float16")
    conv2d = relay.nn.conv2d(
        data=data,
        weight=weight,
        kernel_size=(k_height_size, k_width_size),
        channels=out_channel,
        padding=padding,
        strides=strides,
        out_dtype="float16",
        data_layout="NHWC",
        kernel_layout="HWIO",
    )
    mod = tvm.IRModule.from_expr(conv2d + bias)
    mod = mod.with_attr("executor", relay.backend.Executor("graph", {"link-params": True}))

    # Random fp16 inputs; weight and bias are handed to tuning/compilation as params.
    data_np = np.random.randn(*d_shape).astype("float16")
    weight_np = np.random.randn(*w_shape).astype("float16")
    bias_np = np.random.randn(*bias_shape).astype("float16")
    params = {"weight": weight_np, "bias": bias_np}

    # Reference result computed on the local CPU with the llvm target.
    ref = (
        relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm")
        .evaluate()(*[data_np, weight_np, bias_np])
        .numpy()
    )

    with tempfile.TemporaryDirectory() as work_dir:
        target = get_hexagon_target("v68")
        database = ms.relay_integration.tune_relay(
            mod=mod,
            params=params,
            target=target,
            max_trials_global=8,
            strategy="replay-trace",
            # Fixed: the original passed the undefined name `workdir`, which
            # raised NameError before tuning could start.
            work_dir=work_dir,
            builder=get_hexagon_local_builder(),
            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
        )
        # Compile while the tuning work directory still exists.
        lib = ms.relay_integration.compile_relay(
            database=database,
            mod=mod,
            params=params,
            target=target,
        )

    with hexagon_launcher.create_session() as session:
        rt_mod = session.get_executor_from_factory(lib)

        # Only "data" is set here; weight and bias were supplied as params above.
        rt_mod.set_input("data", data_np)

        rt_mod.run()

        out = rt_mod.get_output(0).numpy()
        # Fairly loose check since fp16 results between x86 and Hexagon have
        # non-trivial difference.
        assert np.mean(np.abs(ref - out)) < 0.5

if __name__ == "__main__":
    # Expected command line:
    #   <script> <android_serial> <tracker_host> <tracker_port> <adb_server_socket>
    print("Start demo")
    android_serial = sys.argv[1]
    tvm_tracker_host = sys.argv[2]
    tvm_tracker_port = int(sys.argv[3])  # the tracker port is passed on as an int
    adb_server_socket = sys.argv[4]
    print(
        f"android_serial: {android_serial}, tvm_host: {tvm_tracker_host}, "
        f"tvm_port: {tvm_tracker_port}, adb_socket: {adb_server_socket}"
    )
    # NOTE(review): the launcher is only constructed here; nothing in this
    # script explicitly starts its RPC server before tuning. Confirm whether
    # HexagonLauncher needs that to be done before it is used.
    launcher = build_launcher(
        android_serial,
        tvm_tracker_host,
        tvm_tracker_port,
        adb_server_socket,
    )
    test_conv2d(launcher)

I don’t know what went wrong and am not sure where to start debugging it.

Thank you