Autoscheduler error for NHWC conv in TFLite model

This problem may be related to "Autoscheduler error for NHWC + ARM back-end". The difference is that I am using a plain fp32 model, not a quantized one.

I am trying to perform auto-tuning on RK3399 with the following code:

from tvm.contrib import graph_executor
from tvm.contrib.utils import tempdir
from tvm import relay, auto_scheduler
import tvm.relay.testing
import tflite
import numpy as np

# Key, host and port handed to auto_scheduler.RPCRunner below.
device_key = "firefly"
rpc_host = "192.168.0.1"
rpc_port = 9191

# Runner used for measurements during tuning (passed to TuningOptions in run_tuning).
runner = auto_scheduler.RPCRunner(
    device_key,
    host=rpc_host,
    port=rpc_port,
    timeout=10,
    repeat=3,
    min_repeat_ms=200,
    enable_cpu_cache_flush=True,
)

target = tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon")

# Read the serialized TFLite model inside a `with` block so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open('resnet50_v1.no-argmax.tflite', "rb") as model_file:
    model_buf = model_file.read()
mod, params = tvm.relay.frontend.from_tflite(tflite.Model.GetRootAsModel(model_buf, 0))

# NOTE(review): input_shape / output_shape are not referenced in the visible
# part of this script; kept in case later code relies on them.
input_shape = (1, 224, 224, 3)
output_shape = (1, 1001)
# Tuning records are both loaded from and appended to this file (see run_tuning).
log_file = "resnet50_v1.no-argmax.json"

print("Extract tasks...")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

# Dump every extracted task together with its compute DAG.
for idx, task in enumerate(tasks):
    print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)


def run_tuning():
    """Tune the extracted tasks and record the measurements.

    Builds a ``TaskScheduler`` from the module-level ``tasks`` and
    ``task_weights`` (passing ``log_file`` as ``load_log_file``), then calls
    ``tune`` with options that measure through the module-level ``runner``
    and attach a ``RecordToFile(log_file)`` callback.
    """
    scheduler = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)

    record_to_log = auto_scheduler.RecordToFile(log_file)
    # Author's note: change num_measure_trials to 20000 to achieve the best performance.
    options = auto_scheduler.TuningOptions(
        num_measure_trials=10000,
        runner=runner,
        measure_callbacks=[record_to_log],
    )

    scheduler.tune(options)


run_tuning()

The model file "resnet50_v1.no-argmax.tflite" was downloaded from https://www.dropbox.com/s/vhuqo0wc39lky0a/.

I constantly see error messages like this during tuning:

[13:08:36] /home/mashplant/Compiler/tvm/src/auto_scheduler/compute_dag.cc:1371: Warning: InferBound fails on the state:
Placeholder: placeholder, placeholder, placeholder
parallel oco@kh@ (None)
  for kw (None)
    for ic (None)
      vectorize oci (None)
        kernel_vec = ...
parallel n.0@oho.0@owo.0@oco.0@ohi.0@owi.0@oci.0@ (0,14)
  conv.local auto_unroll: 16
  for n_c.0 (None)
    for oho_c.0 (None)
      for owo_c.0 (None)
        for oco_c.0 (None)
          for ohi_c.0 (None)
            for owi_c.0 (None)
              for oci_c.0 (None)
                for n_c.1 (None)
                  for oho_c.1 (None)
                    for n (None)
                      for oho (None)
                        for owo (None)
                          for ohi (None)
                            for owi (None)
                              vectorize ic (None)
                                data_vec = ...
                    for owo_c.1 (None)
                      for i0 (None)
                        for i1 (None)
                          for i2 (None)
                            vectorize i3 (None)
                              PadInput = ...
                      for oco_c.1 (None)
                        for ohi_c.1 (None)
                          for owi_c.1 (None)
                            for oci_c.1 (None)
                              for ic.0 (None)
                                for kh.0 (None)
                                  for kw.0 (None)
                                    for n_c.2 (None)
                                      for oho_c.2 (None)
                                        for owo_c.2 (None)
                                          for oco_c.2 (None)
                                            for ohi_c.2 (None)
                                              for owi_c.2 (None)
                                                for oci_c.2 (None)
                                                  for ic.1 (None)
                                                    for kh.1 (None)
                                                      for kw.1 (None)
                                                        for n_c.3 (None)
                                                          for oho_c.3 (None)
                                                            for owo_c.3 (None)
                                                              for oco_c.3 (None)
                                                                for ohi_c.3 (None)
                                                                  for owi_c.3 (None)
                                                                    vectorize oci_c.3 (None)
                                                                      conv.local = ...
  for oho.1 (0,28)
    for owo.1 (0,2)
      for oco.1 (0,128)
        conv = ...
parallel ax0@ax1@ax2@ (0,784)
  for ax3 (0,128)
    T_relu = ...

with: [13:08:36] /home/mashplant/Compiler/tvm/src/te/schedule/bound.cc:175: 
---------------------------------------------------------------
An internal invariant was violated during the execution of TVM.
Please read TVM's error reporting guidelines.
More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.
---------------------------------------------------------------
  Check failed: (found_attach || stage_attach.size() == 0) is false: Invalid Schedule, cannot find the producer compute(PadInput, body=[tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)], axis=[iter_var(i0, range(min=0, ext=1)), iter_var(i1, range(min=0, ext=30)), iter_var(i2, range(min=0, ext=30)), iter_var(i3, range(min=0, ext=128))], reduce_axis=[], tag=injective,pad, attrs={}) along the loop nest specified by compute_at of consumer compute(data_vec, body=[PadInput[n, (oho + ohi), (owo + owi), ic]], axis=[iter_var(n, range(min=0, ext=1)), iter_var(oho, range(min=0, ext=28)), iter_var(owo, range(min=0, ext=28)), iter_var(ohi, range(min=0, ext=3)), iter_var(owi, range(min=0, ext=3)), iter_var(ic, range(min=0, ext=128))], reduce_axis=[], tag=, attrs={})
Stack trace:
  0: tvm::te::InferRootBound(tvm::te::Stage const&, tvm::te::GraphContext const&, std::unordered_map<tvm::tir::IterVar, tvm::Range, std::hash<tvm::tir::IterVar>, std::equal_to<tvm::tir::IterVar>, std::allocator<std::pair<tvm::tir::IterVar const, tvm::Range> > >*)
  1: tvm::te::InferBound(tvm::te::Schedule const&)
  2: tvm::auto_scheduler::ComputeDAG::InferBound(tvm::auto_scheduler::State const&) const
  3: tvm::auto_scheduler::ComputeDAG::InferBound(tvm::runtime::Array<tvm::auto_scheduler::State, void> const&) const::{lambda(int)#1}::operator()(int) const
  4: _ZNSt17_Function_handlerIFSt10unique_ptrINSt13__future_base12_Result_baseENS2_8_DeleterEEvENS1_12_Task_setterIS0_INS1_7_ResultIvEES3_EZNS1_11_Task_stateIZN3tvm7suppor
  5: std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)
  6: __pthread_once_slow
  7: std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::packaged_task<void (std::vector<int, std::allocator<int> > const&, std::function<void (int)> const&)>, std::vector<int, std::allocator<int> >, std::function<void (int)> > > >::_M_run()
  8: execute_native_thread_routine
        at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
  9: start_thread
  10: __GI___clone
  11: 0xffffffffffffffff

Both the host and the remote machine have TVM commit id 36b7dd949e26fa7522a69919b6394bcd2eeb5700.


UPDATE

I have now realized that the problem is not limited to the ARM backend, as I just reproduced it on an x86_64 machine with the following code:

from tvm.contrib import graph_executor
from tvm.contrib.utils import tempdir
from tvm import relay, auto_scheduler
import tvm.relay.testing
import tflite
import numpy as np
import multiprocessing.popen_spawn_posix

target = tvm.target.Target("llvm -mcpu=core-avx2")

# Read the serialized TFLite model inside a `with` block so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open('resnet50_v1.no-argmax.tflite', "rb") as model_file:
    model_buf = model_file.read()
mod, params = tvm.relay.frontend.from_tflite(tflite.Model.GetRootAsModel(model_buf, 0))

# NOTE(review): input_shape / output_shape / dtype are not referenced in the
# visible part of this script; kept in case later code relies on them.
input_shape = (1, 224, 224, 3)
output_shape = (1, 1001)
dtype = "float32"
# Tuning records are both loaded from and appended to this file (see run_tuning).
log_file = "resnet50_v1.no-argmax-local.json"

print("Extract tasks...")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

# Dump every extracted task together with its compute DAG.
for idx, task in enumerate(tasks):
    print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)


def run_tuning():
    """Tune the extracted tasks on the local machine and record the measurements.

    Builds a ``TaskScheduler`` from the module-level ``tasks`` and
    ``task_weights`` (passing ``log_file`` as ``load_log_file``), then calls
    ``tune`` with options that measure through a ``LocalRunner`` and attach a
    ``RecordToFile(log_file)`` callback.
    """
    scheduler = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)

    local_runner = auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True)
    record_to_log = auto_scheduler.RecordToFile(log_file)
    # Author's note: change num_measure_trials to 20000 to achieve the best performance.
    options = auto_scheduler.TuningOptions(
        num_measure_trials=10000,
        runner=local_runner,
        measure_callbacks=[record_to_log],
    )

    scheduler.tune(options)


run_tuning()

The error looks like this:

[20:02:57] /home/mashplant/Compiler/tvm/src/auto_scheduler/compute_dag.cc:1371: Warning: InferBound fails on the state:
Placeholder: placeholder, placeholder, placeholder
parallel p.0@ci.0@ (0,896)
  for eps (None)
    for nu (None)
      for p (None)
        vectorize ci (None)
          input_tile = ...
  for p.1 (0,7)
    for i0 (None)
      for i1 (None)
        for i2 (None)
          vectorize i3 (None)
            data_pad = ...
    unroll eps (0,6)
      unroll nu (0,6)
        unroll r_a (0,6)
          unroll r_b (0,6)
            data_pack = ...
parallel eps.0@nu.0@p.0@co.0@ (0,2)
  bgemm.local auto_unroll: 512
  for eps_c.0 (None)
    for nu_c.0 (None)
      for p_c.0 (None)
        for co_c.0 (None)
          for eps_c.1 (None)
            for nu_c.1 (None)
              for p_c.1 (None)
                for co_c.1 (None)
                  for ci.0 (None)
                    for eps_c.2 (None)
                      for nu_c.2 (None)
                        for p_c.2 (None)
                          for co_c.2 (None)
                            for ci.1 (None)
                              for eps_c.3 (None)
                                for nu_c.3 (None)
                                  for p_c.3 (None)
                                    vectorize co_c.3 (None)
                                      bgemm.local = ...
  for eps.1 (0,6)
    for nu.1 (0,3)
      for p.1 (0,49)
        for co.1 (0,128)
          bgemm = ...
inverse auto_unroll: 512
parallel p.0@co.0@p.1@ (0,392)
  for co.1 (0,16)
    unroll vh (0,4)
      unroll vw (0,4)
        unroll r_a (0,6)
          unroll r_b (0,6)
            inverse = ...
parallel n@h@w@ (None)
  for co (None)
    conv2d_winograd = ...
parallel ax0@ax1@ax2@ (None)
  for ax3 (None)
    T_relu = ...

with: [20:02:57] /home/mashplant/Compiler/tvm/src/te/schedule/bound.cc:175: 
---------------------------------------------------------------
An internal invariant was violated during the execution of TVM.
Please read TVM's error reporting guidelines.
More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.
---------------------------------------------------------------
  Check failed: (found_attach || stage_attach.size() == 0) is false: Invalid Schedule, cannot find the producer compute(data_pad, body=[tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)], axis=[iter_var(i0, range(min=0, ext=1)), iter_var(i1, range(min=0, ext=30)), iter_var(i2, range(min=0, ext=30)), iter_var(i3, range(min=0, ext=128))], reduce_axis=[], tag=injective,pad, attrs={}) along the loop nest specified by compute_at of consumer compute(input_tile, body=[data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*4) + eps), ((floormod(p, 7)*4) + nu), ci]], axis=[iter_var(eps, range(min=0, ext=6)), iter_var(nu, range(min=0, ext=6)), iter_var(p, range(min=0, ext=49)), iter_var(ci, range(min=0, ext=128))], reduce_axis=[], tag=, attrs={})
Stack trace:
  0: tvm::te::InferRootBound(tvm::te::Stage const&, tvm::te::GraphContext const&, std::unordered_map<tvm::tir::IterVar, tvm::Range, std::hash<tvm::tir::IterVar>, std::equal_to<tvm::tir::IterVar>, std::allocator<std::pair<tvm::tir::IterVar const, tvm::Range> > >*)
  1: tvm::te::InferBound(tvm::te::Schedule const&)
  2: tvm::auto_scheduler::ComputeDAG::InferBound(tvm::auto_scheduler::State const&) const
  3: tvm::auto_scheduler::ComputeDAG::InferBound(tvm::runtime::Array<tvm::auto_scheduler::State, void> const&) const::{lambda(int)#1}::operator()(int) const
  4: _ZNSt17_Function_handlerIFSt10unique_ptrINSt13__future_base12_Result_baseENS2_8_DeleterEEvENS1_12_Task_setterIS0_INS1_7_ResultIvEES3_EZNS1_11_Task_stateIZN3tvm7suppor
  5: std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)
  6: __pthread_once_slow
  7: std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::packaged_task<void (std::vector<int, std::allocator<int> > const&, std::function<void (int)> const&)>, std::vector<int, std::allocator<int> >, std::function<void (int)> > > >::_M_run()
  8: execute_native_thread_routine
        at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
  9: start_thread
  10: __GI___clone
  11: 0xffffffffffffffff

The DAG of this task is:

========== Task 14  (workload key: ["ecec634b4882c5731f86cce3109db636", 1, 28, 28, 128, 6, 6, 128, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
placeholder = PLACEHOLDER [1, 28, 28, 128]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*4) + eps), ((floormod(p, 7)*4) + nu), ci]
B(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 6) == 5)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 6) == 4)),  ..(OMITTED)..  (floormod(j, 6) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 6) == 0)), 1f, 0f))))))))))))))))))))))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [6, 6, 128, 128]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 4) == 2)),  ..(OMITTED)..  6) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 4), floormod(w, 4), ((((n*7)*7) + (floordiv(h, 4)*7)) + floordiv(w, 4)), co]
placeholder = PLACEHOLDER [1, 1, 1, 128]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)