[Auto-TVM] How to Auto tune the model on iOS device

jacobpostman · September 1, 2020, 8:03am

Hi @kazum - Thank you for the previous suggestions, I am also looking at how to use autotvm to tune a model on iOS.

Below is a modified version of ‘tutorials/autotvm/tune_relay_arm.py’ that is based on your previous comment suggestion of adding a build_func, but something isn’t working quite right yet.

Tuning tasks are stuck at 0 GFLOPS and the tuning trials time out.

[Task 1/12] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/100) | 0.00 s

If I skip tuning (by commenting out ‘tune_tasks(tasks, **tuning_opt)’), the script successfully builds and runs the untuned model and reports an inference result.

Any idea what step might be missing here?

Thank you!

Assumption: you have a single macOS-based host running the RPC proxy, the tracker, and Xcode, with the local network IP 192.168.0.10.

Setup environment variables:

  export TVM_IOS_CODESIGN='Apple Development: <your@email.com> (<SIGNINGCODE>)'
  export TVM_IOS_RPC_ROOT=${TVM_HOME}/apps/ios_rpc
  export TVM_IOS_RPC_PROXY_HOST=192.168.0.10
  #export TVM_IOS_RPC_DESTINATION='platform=iOS Simulator,id=<simulator id>'
  export TVM_IOS_RPC_DESTINATION='platform=iOS,id=<ios device id>'

Start the tracker

 python3 -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 --no-fork
 INFO:RPCTracker:bind to 0.0.0.0:9190

Start the rpc proxy and point it to the tracker

 python3 -m tvm.exec.rpc_proxy --host 0.0.0.0 --tracker 0.0.0.0:9190 --no-fork
 INFO:root:RPCProxy: client port bind to 0.0.0.0:9090

Run tuning:

 cd ${TVM_HOME}/apps/ios_rpc
 python3 tests/tune_relay_ios.py

Code:

"""
apps/ios_rpc/tests/tune_relay_ios.py

Auto-tuning a convolutional network for iPhone CPU
==================================================

"""

import os
import numpy as np
import tvm
from tvm import te
from tvm import autotvm
from tvm import relay
import tvm.relay.testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime
from tvm.contrib import xcode

#################################################################
# Define network
# --------------
# First we need to define the network in relay frontend API.
# We can load some pre-defined network from :code:`relay.testing`.
# We can also load models from MXNet, ONNX and TensorFlow.

def get_network(name, batch_size):
    """Get the symbol definition and random weight of a network.

    The element type of the workload is taken from the module-level ``dtype``.

    Parameters
    ----------
    name : str
        Which network to build. Any name containing ``resnet`` or ``vgg`` is
        parsed as ``<family>-<num_layers>`` (e.g. ``resnet-18``); the other
        accepted values are ``mobilenet``, ``squeezenet_v1.1``,
        ``inception_v3`` and ``mxnet``.
    batch_size : int
        Batch dimension used for the workload and the reported shapes.

    Returns
    -------
    mod, params, input_shape, output_shape
        The relay module, its parameters, and the input/output shapes as
        tuples whose first entry is ``batch_size``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported networks.
    """
    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1000)

    if "resnet" in name:
        n_layer = int(name.split('-')[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer, batch_size=batch_size, dtype=dtype)
    elif "vgg" in name:
        n_layer = int(name.split('-')[1])
        mod, params = relay.testing.vgg.get_workload(
            num_layers=n_layer, batch_size=batch_size, dtype=dtype)
    elif name == 'mobilenet':
        # Forward dtype like every other branch so that the workload always
        # matches the dtype of the input data created by the caller.
        mod, params = relay.testing.mobilenet.get_workload(
            batch_size=batch_size, dtype=dtype)
    elif name == 'squeezenet_v1.1':
        mod, params = relay.testing.squeezenet.get_workload(
            batch_size=batch_size, version='1.1', dtype=dtype)
    elif name == 'inception_v3':
        # Inception v3 takes 299x299 images; keep the requested batch size
        # instead of hard-coding a batch of 1.
        input_shape = (batch_size, 3, 299, 299)
        mod, params = relay.testing.inception_v3.get_workload(
            batch_size=batch_size, dtype=dtype)
    elif name == 'mxnet':
        # an example for mxnet model
        # Imported lazily so mxnet is only required when this branch is used.
        from mxnet.gluon.model_zoo.vision import get_model
        block = get_model('resnet18_v1', pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
        # Re-wrap the imported function with a softmax applied to its body.
        net = mod["main"]
        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
        mod = tvm.IRModule.from_expr(net)
    else:
        raise ValueError("Unsupported network: " + name)

    return mod, params, input_shape, output_shape

#################################################################
# Start RPC Tracker
# ------------------
# python3 -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 --no-fork
#
#  - Autotvm will use the tracker to orchestrate tuning test runs.
# 
# Start RPC Proxy
# ---------------
# python3 -m tvm.exec.rpc_proxy --host 0.0.0.0 --tracker 0.0.0.0:9190 --no-fork


###########################################
# Set Tuning Options
# ------------------

#### DEVICE CONFIG ####

# Address of the TVM RPC proxy, taken from the environment.
proxy_host = os.environ["TVM_IOS_RPC_PROXY_HOST"]

# Destination of the test run, taken from the environment.
# Expected format: "platform=iOS,id=<the test device uuid>"
destination = os.environ["TVM_IOS_RPC_DESTINATION"]

# Key under which the device is requested from the tracker, and the port the
# RPC proxy listens on.
device_key = 'iphone'
proxy_port = 9090

# Toolchain settings passed to xcode.create_dylib, and the LLVM targets
# derived from the architecture.
arch = "arm64"
sdk = "iphoneos"
target = "llvm -mtriple={}-apple-darwin".format(arch)
target_host = "llvm -mtriple={}-apple-darwin".format(arch)

#### TUNING OPTION ####
network = 'resnet-18'
log_file = "{}.{}.log".format(device_key, network)
dtype = 'float32'

# Replace autotvm's remote check with a stub that always reports success.
# NOTE(review): presumably needed because the iOS app only registers with the
# tracker once a build has launched it -- confirm.
autotvm.measure.measure_methods.check_remote = lambda *args: True

def fcompile(*args):
    """Build function handed to ``autotvm.LocalBuilder``.

    All positional arguments are forwarded unchanged to
    ``xcode.create_dylib`` together with the module-level ``arch`` and
    ``sdk``. The first argument is then used as the path of the produced
    dylib: it is code-signed and passed to ``xcode.popen_test_rpc`` as the
    only entry of ``libs``.
    """
    xcode.create_dylib(*args, arch=arch, sdk=sdk)

    # By convention of this script the first argument is the output path.
    dylib_path = args[0]
    xcode.codesign(dylib_path)
    xcode.popen_test_rpc(
        proxy_host,
        proxy_port,
        device_key,
        destination=destination,
        libs=[dylib_path],
    )


# Advertise the artifact format produced by this build function.
fcompile.output_format = "dylib"

# Keyword arguments for tune_tasks(): where to log, which tuner to use, how
# many trials to run, and how candidates are built and measured.
tuning_option = dict(
    log_filename=log_file,
    tuner='random',
    early_stopping=None,
    n_trial=100,
    measure_option=autotvm.measure_option(
        # Candidates are built one at a time on the host through fcompile.
        builder=autotvm.LocalBuilder(
            n_parallel=1,
            build_func=fcompile,
            timeout=60,
        ),
        # Measurements run on the device requested from the tracker by key.
        runner=autotvm.RPCRunner(
            device_key,
            host='127.0.0.1',
            port=9190,
            number=20,
            repeat=3,
            timeout=60,
            min_repeat_ms=150,
        ),
    ),
)

###################################################################
# Begin Tuning
# ------------

def tune_tasks(tasks,
               measure_option,
               tuner='random',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True):
    """Tune every task and keep the best records in ``log_filename``.

    Records of all trials are appended to ``log_filename + ".tmp"``; once all
    tasks are done the best records are picked into ``log_filename`` and the
    temporary file is deleted.

    Parameters
    ----------
    tasks : sequence
        Tuning tasks. They are processed in reverse order.
    measure_option
        Passed unchanged to each tuner's ``tune`` call.
    tuner : str
        One of ``xgb``, ``xgb-rank``, ``xgb_knob``, ``ga``, ``random`` or
        ``gridsearch``.
    n_trial : int
        Upper bound on trials per task; capped by the size of the task's
        config space.
    early_stopping
        Passed unchanged to each tuner's ``tune`` call.
    log_filename : str
        Destination file for the best records.
    use_transfer_learning : bool
        When true, each tuner first loads the records already written to the
        temporary log by previously tuned tasks.

    Raises
    ------
    ValueError
        If ``tuner`` is not a supported name (checked when a task is tuned).
    """
    # create tmp log file (start from a clean slate if a stale one exists)
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner in ('xgb', 'xgb-rank'):
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'xgb_knob':
            tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # Seed the tuner with what earlier tasks in this run already logged.
        if use_transfer_learning and os.path.isfile(tmp_log_file):
            tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # do tuning
        tsk_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(n_trial=tsk_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)
                       ])

    # Nothing was logged (e.g. the task list was empty): there is nothing to
    # pick from and no temporary file to clean up, so do not fail on a
    # missing file.
    if not os.path.isfile(tmp_log_file):
        print("No tuning records were written; %s was not updated." % log_filename)
        return

    # pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)

########################################################################
# Finally, we launch tuning jobs and evaluate the end-to-end performance.

def tune_and_evaluate(tuning_opt):
    """Tune the conv2d tasks of the selected network, then build and time it.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments forwarded to ``tune_tasks``.

    The network name, target, log file, dtype and device settings are read
    from the module-level configuration. The mean and standard deviation of
    the measured inference time are printed; nothing is returned.
    """
    # extract workloads from relay program
    print("Extract tasks...")
    mod, params, input_shape, _ = get_network(network, batch_size=1)
    conv2d_op = relay.op.get("nn.conv2d")
    tasks = autotvm.task.extract_from_program(
        mod["main"], target=target, params=params, ops=(conv2d_op,))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            graph_json, built_lib, built_params = relay.build_module.build(
                mod, target=target, params=params)

        # export library as a signed dylib
        dylib_name = "tuned_deploy.dylib"
        built_lib.export_library(dylib_name, xcode.create_dylib, arch=arch, sdk=sdk)
        xcode.codesign(dylib_name)

        # Evaluate inference cost on tuned lib
        xcode.popen_test_rpc(
            proxy_host,
            proxy_port,
            device_key,
            destination=destination,
            libs=[dylib_name],
        )

        session = autotvm.measure.request_remote(
            device_key, '0.0.0.0', 9190, timeout=10000)

        # Upload not needed for ios because dylib is built into app
        remote_lib = session.load_module(dylib_name)
        device = session.cpu(0)

        executor = runtime.create(graph_json, remote_lib, device)
        random_input = np.random.uniform(size=input_shape).astype(dtype)
        executor.set_input('data', tvm.nd.array(random_input))
        executor.set_input(**built_params)

        # evaluate
        print("Evaluate inference time cost...")
        timer = executor.module.time_evaluator("run", device, number=3, repeat=20)
        latencies_ms = np.array(timer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(latencies_ms), np.std(latencies_ms)))

# Script entry point: tuning only runs when this file is executed directly,
# because it takes a long time and needs a connected device.
if __name__ == '__main__':
    # Delete an rpc_config.txt left in the working directory by an earlier run.
    # NOTE(review): presumably the file written by xcode.popen_test_rpc -- confirm.
    if os.path.exists("rpc_config.txt"):
        os.remove("rpc_config.txt")
    tune_and_evaluate(tuning_option)

######################################################################
# Sample Output
# -------------