I see that you can look into the VTA instructions, which means you’ve looked into the design quite a bit.
I assume you’ve read the technical references, and the tech report on VTA as well?
The key to understanding how TVM produces VTA code is to start from the lowered TVM schedule (see the matrix multiplication example that you linked) and work out how you end up with a lowered schedule like this one:
// attr [C_buf] storage_scope = "local.acc_buffer"
// attr [A_buf] storage_scope = "local.inp_buffer"
// attr [B_buf] storage_scope = "local.wgt_buffer"
produce C_buf {
// attr [iter_var(vta, , vta)] coproc_scope = 2
// attr [iter_var(vta, , vta)] coproc_uop_scope = "VTAPushGEMMOp"
VTAUopLoopBegin(16, 1, 0, 0)
VTAUopPush(0, 1, 0, 0, 0, 0, 0, 0)
VTAUopLoopEnd()
vta.coproc_dep_push(2, 1)
for (ko, 0, 16) {
// attr [iter_var(vta, , vta)] coproc_scope = 1
vta.coproc_dep_pop(2, 1)
produce A_buf {
VTALoadBuffer2D(tvm_thread_context(VTATLSCommandHandle()), A, ko, 1, 1, 1, 0, 0, 0, 0, 0, 2)
}
produce B_buf {
VTALoadBuffer2D(tvm_thread_context(VTATLSCommandHandle()), B, ko, 1, 16, 16, 0, 0, 0, 0, 0, 1)
}
vta.coproc_dep_push(1, 2)
// attr [iter_var(vta, , vta)] coproc_scope = 2
vta.coproc_dep_pop(1, 2)
// attr [iter_var(vta, , vta)] coproc_uop_scope = "VTAPushGEMMOp"
VTAUopLoopBegin(16, 1, 0, 1)
VTAUopPush(0, 0, 0, 0, 0, 0, 0, 0)
VTAUopLoopEnd()
vta.coproc_dep_push(2, 1)
}
vta.coproc_dep_push(2, 3)
vta.coproc_dep_pop(2, 1)
}
// attr [iter_var(vta, , vta)] coproc_scope = 3
vta.coproc_dep_pop(2, 3)
produce C {
VTAStoreBuffer2D(tvm_thread_context(VTATLSCommandHandle()), 0, 4, C, 0, 16, 1, 16)
}
vta.coproc_sync()