The script above failed with:

```
tvm._ffi.base.TVMError: [17:13:55] /Users/yizhiliu/Workspace/tvm/src/op/tensorize.cc:318: Check failed: Equal(lhs, rhs) Failed to match the compute with TensorIntrin Matmul's declaration
provided= reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0.000000f]), source=[(placeholder(j, k)*placeholder(0, k))], axis=[iter_var(k, Range(min=0, extent=128))], where=(uint1)1, value_index=0),
intrin=   reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0.000000f]), source=[(placeholder(i, k)*placeholder(j, k))], axis=[iter_var(k, Range(min=0, extent=128))], where=(uint1)1, value_index=0)
Stack trace returned 10 entries:
[bt] (0) 0 libtvm.dylib 0x0000000106598afb dmlc::StackTrace() + 299
[bt] (1) 1 libtvm.dylib 0x000000010659888f dmlc::LogMessageFatal::~LogMessageFatal() + 47
[bt] (2) 2 libtvm.dylib 0x00000001067b3acc tvm::VerifyTensorizeBody(tvm::ComputeOpNode const*, tvm::Stage const&, std::__1::unordered_map<tvm::IterVar, tvm::Range, std::__1::hash<tvm::IterVar>, std::__1::equal_to<tvm::IterVar>, std::__1::allocator<std::__1::pair<tvm::IterVar const, tvm::Range> > > const&, std::__1::unordered_map<tvm::Tensor, tvm::Array<tvm::Range, void>, std::__1::hash<tvm::Tensor>, std::__1::equal_to<tvm::Tensor>, std::__1::allocator<std::__1::pair<tvm::Tensor const, tvm::Array<tvm::Range, void> > > > const&, tvm::TensorIntrin const&) + 2396
[bt] (3) 3 libtvm.dylib 0x00000001067b4d70 tvm::MakeTensorize(tvm::ComputeOpNode const*, tvm::Stage const&, std::__1::unordered_map<tvm::IterVar, tvm::Range, std::__1::hash<tvm::IterVar>, std::__1::equal_to<tvm::IterVar>, std::__1::allocator<std::__1::pair<tvm::IterVar const, tvm::Range> > > const&, bool) + 544
[bt] (4) 4 libtvm.dylib 0x0000000106795e5c tvm::ComputeOpNode::BuildProvide(tvm::Stage const&, std::__1::unordered_map<tvm::IterVar, tvm::Range, std::__1::hash<tvm::IterVar>, std::__1::equal_to<tvm::IterVar>, std::__1::allocator<std::__1::pair<tvm::IterVar const, tvm::Range> > > const&, bool) const + 348
[bt] (5) 5 libtvm.dylib 0x00000001067f8a5e tvm::schedule::MakePipeline(tvm::Stage const&, std::__1::unordered_map<tvm::IterVar, tvm::Range, std::__1::hash<tvm::IterVar>, std::__1::equal_to<tvm::IterVar>, std::__1::allocator<std::__1::pair<tvm::IterVar const, tvm::Range> > > const&, HalideIR::Internal::Stmt, bool) + 62
[bt] (6) 6 libtvm.dylib 0x00000001067f9e73 tvm::schedule::ScheduleOps(tvm::Schedule, tvm::Map<tvm::IterVar, tvm::Range, void, void>, bool) + 3027
[bt] (7) 7 libtvm.dylib 0x00000001065d613c std::__1::__function::__func<tvm::schedule::$_2, std::__1::allocator<tvm::schedule::$_2>, void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&) + 204
[bt] (8) 8 libtvm.dylib 0x0000000106972eb8 TVMFuncCall + 72
[bt] (9) 9 libffi.6.dylib 0x00000001056d9884 ffi_call_unix64 + 76
```
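If I read the check in src/op/tensorize.cc correctly, the reduce expression of the stage being tensorized (`provided=`) has to be structurally identical to the body of the intrinsic declaration (`intrin=`) after remapping the loop variables, and here the two sources differ: my compute multiplies `placeholder(j, k) * placeholder(0, k)` (the second operand pinned to index 0), while the intrinsic was declared over two full 2-D operands. For reference, a declaration like the following sketch (hypothetical names, shapes guessed from `extent=128` in the error) is the kind of thing that yields the `intrin=` pattern above:

```python
import tvm

# Matmul-style declaration: both operands are indexed by a free output
# axis, which gives source=[(placeholder(i, k)*placeholder(j, k))].
# This cannot unify with the provided source, whose second operand uses
# the constant index 0.
n, m, l = 16, 16, 128
a = tvm.placeholder((n, l), name='a')
b = tvm.placeholder((m, l), name='b')
k = tvm.reduce_axis((0, l), name='k')
c = tvm.compute((n, m),
                lambda i, j: tvm.sum(a[i, k] * b[j, k], axis=k),
                name='c')
```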
If I change https://gist.github.com/yzhliu/7a694a99be7f11ebfbf2d634111b6175#file-conv2d_tensorize-py-L33-L34 to
```python
out = tvm.compute((16,),
                  lambda i: tvm.sum(inp(i, k) * wgt(0, k), axis=[k]))
```
it fails with:

```
tvm._ffi.base.TVMError: [17:21:16] /Users/yizhiliu/Workspace/tvm/src/pass/arg_binder.cc:93: Check failed: is_zero(value->elem_offset) Trying to bind a Buffer with offset into one without offset
Stack trace returned 10 entries:
[bt] (0) 0 libtvm.dylib 0x00000001018acafb dmlc::StackTrace() + 299
[bt] (1) 1 libtvm.dylib 0x00000001018ac88f dmlc::LogMessageFatal::~LogMessageFatal() + 47
[bt] (2) 2 libtvm.dylib 0x00000001019beaca tvm::ir::ArgBinder::BindBuffer(tvm::Buffer const&, tvm::Buffer const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, bool) + 970
[bt] (3) 3 libtvm.dylib 0x0000000101a7e958 tvm::ir::StorageFlattener::HandleBufferBindScope(HalideIR::Internal::AttrStmt const*) + 4568
[bt] (4) 4 libtvm.dylib 0x0000000101a774f4 tvm::ir::StorageFlattener::Mutate_(HalideIR::Internal::AttrStmt const*, HalideIR::Internal::Stmt const&) + 852
[bt] (5) 5 libtvm.dylib 0x0000000101a06625 std::__1::__function::__func<tvm::ir::$_1, std::__1::allocator<tvm::ir::$_1>, HalideIR::Internal::Stmt (HalideIR::Internal::AttrStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(HalideIR::Internal::AttrStmt const*&&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*&&) + 21
[bt] (6) 6 libtvm.dylib 0x0000000101a05c04 std::__1::__function::__func<tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>& tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::set_dispatch<HalideIR::Internal::AttrStmt>(std::__1::function<HalideIR::Internal::Stmt (HalideIR::Internal::AttrStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>)::'lambda'(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*), std::__1::allocator<tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>& tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::set_dispatch<HalideIR::Internal::AttrStmt>(std::__1::function<HalideIR::Internal::Stmt (HalideIR::Internal::AttrStmt const*, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>)::'lambda'(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>, HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*&&) + 52
[bt] (7) 7 libtvm.dylib 0x00000001018f5ea9 tvm::IRFunctor<HalideIR::Internal::Stmt (tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*)>::operator()(tvm::NodeRef const&, HalideIR::Internal::Stmt const&, tvm::ir::IRMutator*) const + 377
[bt] (8) 8 libtvm.dylib 0x00000001019c5bb6 tvm::ir::IRMutator::Mutate(HalideIR::Internal::Stmt) + 102
[bt] (9) 9 libtvm.dylib 0x00000001019f8c2b tvm::ir::IRMutator::Mutate_(HalideIR::Internal::For const*, HalideIR::Internal::Stmt const&) + 235
```
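For this one, the check in src/pass/arg_binder.cc seems to fire because the sub-region handed to the intrinsic starts at a nonzero `elem_offset`, while the buffers declared for the intrinsic default to `offset_factor=0`, i.e. a zero offset. If I understand `tvm.decl_buffer` correctly, declaring the intrinsic's buffers with `offset_factor=1` (and symbolic strides where the bound region is not compact) should let the binder accept the offsetted view. A sketch of what I think the declaration would look like, with hypothetical names and shapes guessed from the errors above:

```python
import tvm

def intrin_gemv(m=16, l=128):
    # Declaration mirroring the changed compute: the second operand is
    # pinned to row 0 and the reduction runs over the shared axis k.
    a = tvm.placeholder((m, l), name='a')
    b = tvm.placeholder((1, l), name='b')
    k = tvm.reduce_axis((0, l), name='k')
    c = tvm.compute((m,), lambda i: tvm.sum(a[i, k] * b[0, k], axis=k),
                    name='c')

    # offset_factor=1 declares that elem_offset may be any multiple of 1,
    # so BindBuffer no longer requires is_zero(elem_offset). The symbolic
    # stride allows binding a non-compact 2-D sub-region.
    Ab = tvm.decl_buffer(a.shape, a.dtype, name='A', offset_factor=1,
                         strides=[tvm.var('sa'), 1])
    Bb = tvm.decl_buffer(b.shape, b.dtype, name='B', offset_factor=1,
                         strides=[tvm.var('sb'), 1])
    Cb = tvm.decl_buffer(c.shape, c.dtype, name='C', offset_factor=1)

    def intrin_func(ins, outs):
        aa, bb = ins
        cc = outs[0]
        ib = tvm.ir_builder.create()
        # 'gemv_update' is a placeholder for the real extern micro-kernel.
        ib.emit(tvm.call_extern('int32', 'gemv_update',
                                cc.access_ptr('w'),
                                aa.access_ptr('r'),
                                bb.access_ptr('r'),
                                m, l))
        return ib.get()

    return tvm.decl_tensor_intrin(c.op, intrin_func,
                                  binds={a: Ab, b: Bb, c: Cb})
```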
I am really confused about how to make the tensorize declaration match the original compute loop. Do we have any tutorials for tensorization?