This problem may be related to Auto-scheduler error for NHWC + ARM back-end. The difference is that I am using a plain fp32 model, not a quantized one.
I am trying to auto-tune on an RK3399 with the following code:
import numpy as np
import tflite

import tvm
import tvm.relay.testing
from tvm import relay, auto_scheduler
from tvm.contrib import graph_executor
from tvm.contrib.utils import tempdir

device_key = "firefly"
rpc_host = "192.168.0.1"
rpc_port = 9191

runner = auto_scheduler.RPCRunner(
    device_key,
    host=rpc_host,
    port=rpc_port,
    timeout=10,
    repeat=3,
    min_repeat_ms=200,
    enable_cpu_cache_flush=True,
)

target = tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon")

mod, params = tvm.relay.frontend.from_tflite(
    tflite.Model.GetRootAsModel(open("resnet50_v1.no-argmax.tflite", "rb").read(), 0)
)
input_shape = (1, 224, 224, 3)
output_shape = (1, 1001)
log_file = "resnet50_v1.no-argmax.json"

print("Extract tasks...")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
for idx, task in enumerate(tasks):
    print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)

def run_tuning():
    tuner = auto_scheduler.TaskScheduler(
        tasks,
        task_weights,
        load_log_file=log_file,
    )
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=10000,  # change this to 20000 to achieve the best performance
        runner=runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)

run_tuning()
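For context, once tuning finishes I would compile the network with the tuned records in the usual auto-scheduler way. A minimal sketch, assuming the log file contains usable records:

# Sketch: build the network with the best schedules recorded in log_file.
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.backend.use_auto_scheduler": True}
    ):
        lib = relay.build(mod, target=target, params=params)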
The model file ‘resnet50_v1.no-argmax.tflite’ was downloaded from https://www.dropbox.com/s/vhuqo0wc39lky0a/.
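To rule out quantization, the input tensor type can be inspected directly with the tflite package. A small sketch; it should print True for an fp32 model:

# Sketch: check the dtype of the model's first input tensor.
import tflite

buf = open("resnet50_v1.no-argmax.tflite", "rb").read()
model = tflite.Model.GetRootAsModel(buf, 0)
graph = model.Subgraphs(0)
tensor = graph.Tensors(graph.Inputs(0))
print(tensor.Type() == tflite.TensorType.FLOAT32)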
While tuning, I consistently see error messages like this:
[13:08:36] /home/mashplant/Compiler/tvm/src/auto_scheduler/compute_dag.cc:1371: Warning: InferBound fails on the state:
Placeholder: placeholder, placeholder, placeholder
parallel oco@kh@ (None)
for kw (None)
for ic (None)
vectorize oci (None)
kernel_vec = ...
parallel n.0@oho.0@owo.0@oco.0@ohi.0@owi.0@oci.0@ (0,14)
conv.local auto_unroll: 16
for n_c.0 (None)
for oho_c.0 (None)
for owo_c.0 (None)
for oco_c.0 (None)
for ohi_c.0 (None)
for owi_c.0 (None)
for oci_c.0 (None)
for n_c.1 (None)
for oho_c.1 (None)
for n (None)
for oho (None)
for owo (None)
for ohi (None)
for owi (None)
vectorize ic (None)
data_vec = ...
for owo_c.1 (None)
for i0 (None)
for i1 (None)
for i2 (None)
vectorize i3 (None)
PadInput = ...
for oco_c.1 (None)
for ohi_c.1 (None)
for owi_c.1 (None)
for oci_c.1 (None)
for ic.0 (None)
for kh.0 (None)
for kw.0 (None)
for n_c.2 (None)
for oho_c.2 (None)
for owo_c.2 (None)
for oco_c.2 (None)
for ohi_c.2 (None)
for owi_c.2 (None)
for oci_c.2 (None)
for ic.1 (None)
for kh.1 (None)
for kw.1 (None)
for n_c.3 (None)
for oho_c.3 (None)
for owo_c.3 (None)
for oco_c.3 (None)
for ohi_c.3 (None)
for owi_c.3 (None)
vectorize oci_c.3 (None)
conv.local = ...
for oho.1 (0,28)
for owo.1 (0,2)
for oco.1 (0,128)
conv = ...
parallel ax0@ax1@ax2@ (0,784)
for ax3 (0,128)
T_relu = ...
with: [13:08:36] /home/mashplant/Compiler/tvm/src/te/schedule/bound.cc:175:
---------------------------------------------------------------
An internal invariant was violated during the execution of TVM.
Please read TVM's error reporting guidelines.
More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.
---------------------------------------------------------------
Check failed: (found_attach || stage_attach.size() == 0) is false: Invalid Schedule, cannot find the producer compute(PadInput, body=[tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)], axis=[iter_var(i0, range(min=0, ext=1)), iter_var(i1, range(min=0, ext=30)), iter_var(i2, range(min=0, ext=30)), iter_var(i3, range(min=0, ext=128))], reduce_axis=[], tag=injective,pad, attrs={}) along the loop nest specified by compute_at of consumer compute(data_vec, body=[PadInput[n, (oho + ohi), (owo + owi), ic]], axis=[iter_var(n, range(min=0, ext=1)), iter_var(oho, range(min=0, ext=28)), iter_var(owo, range(min=0, ext=28)), iter_var(ohi, range(min=0, ext=3)), iter_var(owi, range(min=0, ext=3)), iter_var(ic, range(min=0, ext=128))], reduce_axis=[], tag=, attrs={})
Stack trace:
0: tvm::te::InferRootBound(tvm::te::Stage const&, tvm::te::GraphContext const&, std::unordered_map<tvm::tir::IterVar, tvm::Range, std::hash<tvm::tir::IterVar>, std::equal_to<tvm::tir::IterVar>, std::allocator<std::pair<tvm::tir::IterVar const, tvm::Range> > >*)
1: tvm::te::InferBound(tvm::te::Schedule const&)
2: tvm::auto_scheduler::ComputeDAG::InferBound(tvm::auto_scheduler::State const&) const
3: tvm::auto_scheduler::ComputeDAG::InferBound(tvm::runtime::Array<tvm::auto_scheduler::State, void> const&) const::{lambda(int)#1}::operator()(int) const
4: _ZNSt17_Function_handlerIFSt10unique_ptrINSt13__future_base12_Result_baseENS2_8_DeleterEEvENS1_12_Task_setterIS0_INS1_7_ResultIvEES3_EZNS1_11_Task_stateIZN3tvm7suppor
5: std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)
6: __pthread_once_slow
7: std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::packaged_task<void (std::vector<int, std::allocator<int> > const&, std::function<void (int)> const&)>, std::vector<int, std::allocator<int> >, std::function<void (int)> > > >::_M_run()
8: execute_native_thread_routine
at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
9: start_thread
10: __GI___clone
11: 0xffffffffffffffff
Both the host and the remote machine are on TVM commit 36b7dd949e26fa7522a69919b6394bcd2eeb5700.
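For anyone reproducing this, the commit can be confirmed on each machine with tvm.support.libinfo(), assuming the build exposes it:

import tvm
# Prints the git commit hash baked into the TVM build.
print(tvm.support.libinfo()["GIT_COMMIT_HASH"])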
UPDATE
I have now realized that the problem is not limited to the ARM backend: I just reproduced it on an x86_64 machine with the following code:
import numpy as np
import tflite

import tvm
import tvm.relay.testing
from tvm import relay, auto_scheduler
from tvm.contrib import graph_executor
from tvm.contrib.utils import tempdir
import multiprocessing.popen_spawn_posix

target = tvm.target.Target("llvm -mcpu=core-avx2")

mod, params = tvm.relay.frontend.from_tflite(
    tflite.Model.GetRootAsModel(open("resnet50_v1.no-argmax.tflite", "rb").read(), 0)
)
input_shape = (1, 224, 224, 3)
output_shape = (1, 1001)
dtype = "float32"
log_file = "resnet50_v1.no-argmax-local.json"

print("Extract tasks...")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
for idx, task in enumerate(tasks):
    print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
    print(task.compute_dag)

def run_tuning():
    tuner = auto_scheduler.TaskScheduler(
        tasks,
        task_weights,
        load_log_file=log_file,
    )
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=10000,  # change this to 20000 to achieve the best performance
        runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)

run_tuning()
The error is similar:
[20:02:57] /home/mashplant/Compiler/tvm/src/auto_scheduler/compute_dag.cc:1371: Warning: InferBound fails on the state:
Placeholder: placeholder, placeholder, placeholder
parallel p.0@ci.0@ (0,896)
for eps (None)
for nu (None)
for p (None)
vectorize ci (None)
input_tile = ...
for p.1 (0,7)
for i0 (None)
for i1 (None)
for i2 (None)
vectorize i3 (None)
data_pad = ...
unroll eps (0,6)
unroll nu (0,6)
unroll r_a (0,6)
unroll r_b (0,6)
data_pack = ...
parallel eps.0@nu.0@p.0@co.0@ (0,2)
bgemm.local auto_unroll: 512
for eps_c.0 (None)
for nu_c.0 (None)
for p_c.0 (None)
for co_c.0 (None)
for eps_c.1 (None)
for nu_c.1 (None)
for p_c.1 (None)
for co_c.1 (None)
for ci.0 (None)
for eps_c.2 (None)
for nu_c.2 (None)
for p_c.2 (None)
for co_c.2 (None)
for ci.1 (None)
for eps_c.3 (None)
for nu_c.3 (None)
for p_c.3 (None)
vectorize co_c.3 (None)
bgemm.local = ...
for eps.1 (0,6)
for nu.1 (0,3)
for p.1 (0,49)
for co.1 (0,128)
bgemm = ...
inverse auto_unroll: 512
parallel p.0@co.0@p.1@ (0,392)
for co.1 (0,16)
unroll vh (0,4)
unroll vw (0,4)
unroll r_a (0,6)
unroll r_b (0,6)
inverse = ...
parallel n@h@w@ (None)
for co (None)
conv2d_winograd = ...
parallel ax0@ax1@ax2@ (None)
for ax3 (None)
T_relu = ...
with: [20:02:57] /home/mashplant/Compiler/tvm/src/te/schedule/bound.cc:175:
---------------------------------------------------------------
An internal invariant was violated during the execution of TVM.
Please read TVM's error reporting guidelines.
More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.
---------------------------------------------------------------
Check failed: (found_attach || stage_attach.size() == 0) is false: Invalid Schedule, cannot find the producer compute(data_pad, body=[tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)], axis=[iter_var(i0, range(min=0, ext=1)), iter_var(i1, range(min=0, ext=30)), iter_var(i2, range(min=0, ext=30)), iter_var(i3, range(min=0, ext=128))], reduce_axis=[], tag=injective,pad, attrs={}) along the loop nest specified by compute_at of consumer compute(input_tile, body=[data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*4) + eps), ((floormod(p, 7)*4) + nu), ci]], axis=[iter_var(eps, range(min=0, ext=6)), iter_var(nu, range(min=0, ext=6)), iter_var(p, range(min=0, ext=49)), iter_var(ci, range(min=0, ext=128))], reduce_axis=[], tag=, attrs={})
Stack trace:
0: tvm::te::InferRootBound(tvm::te::Stage const&, tvm::te::GraphContext const&, std::unordered_map<tvm::tir::IterVar, tvm::Range, std::hash<tvm::tir::IterVar>, std::equal_to<tvm::tir::IterVar>, std::allocator<std::pair<tvm::tir::IterVar const, tvm::Range> > >*)
1: tvm::te::InferBound(tvm::te::Schedule const&)
2: tvm::auto_scheduler::ComputeDAG::InferBound(tvm::auto_scheduler::State const&) const
3: tvm::auto_scheduler::ComputeDAG::InferBound(tvm::runtime::Array<tvm::auto_scheduler::State, void> const&) const::{lambda(int)#1}::operator()(int) const
4: _ZNSt17_Function_handlerIFSt10unique_ptrINSt13__future_base12_Result_baseENS2_8_DeleterEEvENS1_12_Task_setterIS0_INS1_7_ResultIvEES3_EZNS1_11_Task_stateIZN3tvm7suppor
5: std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*)
6: __pthread_once_slow
7: std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::packaged_task<void (std::vector<int, std::allocator<int> > const&, std::function<void (int)> const&)>, std::vector<int, std::allocator<int> >, std::function<void (int)> > > >::_M_run()
8: execute_native_thread_routine
at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
9: start_thread
10: __GI___clone
11: 0xffffffffffffffff
The DAG of this task is:
========== Task 14 (workload key: ["ecec634b4882c5731f86cce3109db636", 1, 28, 28, 128, 6, 6, 128, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
placeholder = PLACEHOLDER [1, 28, 28, 128]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*4) + eps), ((floormod(p, 7)*4) + nu), ci]
B(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 6) == 5)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 6) == 4)), ..(OMITTED).. (floormod(j, 6) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 6) == 0)), 1f, 0f))))))))))))))))))))))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [6, 6, 128, 128]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 4) == 2)), ..(OMITTED).. 6) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 4), floormod(w, 4), ((((n*7)*7) + (floordiv(h, 4)*7)) + floordiv(w, 4)), co]
placeholder = PLACEHOLDER [1, 1, 1, 128]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
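For a quicker reproduction, the failing winograd task (Task 14 above) can be tuned in isolation and the log checked afterwards. A sketch reusing the TaskScheduler pattern from the script; the task index and trial count are just for illustration:

# Sketch: tune only Task 14 with a small trial budget.
single_log = "task14.json"
tuner = auto_scheduler.TaskScheduler([tasks[14]], [1.0])
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=64,
    runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
    measure_callbacks=[auto_scheduler.RecordToFile(single_log)],
)
tuner.tune(tune_option)

# Despite the InferBound warnings, some records may still reach the log.
records = list(auto_scheduler.load_records(single_log))
print("records for task 14:", len(records))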