Error while generating IR

Hi all,

I successfully generated the IR for the PYNQ board for the matrix multiplication application (tvm/vta/tutorials/matrix_multiply.py). However, when I generate the IR for the same application on a Linux machine for the cross-compiled build, I get a different IR.

PYNQ IR (matrix_multiply):

from tvm.script import ir as I

from tvm.script import tir as T

@I.ir_module class Module: @T.prim_func def main(A: T.Buffer((1, 16, 1, 16), “int8”), B: T.Buffer((16, 16, 16, 16), “int8”), C: T.Buffer((1, 16, 1, 16), “int8”)): T.func_attr({“from_legacy_te_schedule”: T.bool(True), “tir.noalias”: T.bool(True)}) A_buf = T.allocate([256], “int8”, “global”) B_buf = T.allocate([65536], “int8”, “global”) C_buf = T.allocate([256], “int32”, “global”) A_buf_1 = T.Buffer((256,), “int8”, data=A_buf) for i1, i3 in T.grid(16, 16): cse_var_1: T.int32 = i1 * 16 + i3 A_1 = T.Buffer((256,), “int8”, data=A.data) A_buf_1[cse_var_1] = A_1[cse_var_1] B_buf_1 = T.Buffer((65536,), “int8”, data=B_buf) for i0, i1, i2, i3 in T.grid(16, 16, 16, 16): cse_var_2: T.int32 = i0 * 4096 + i1 * 256 + i2 * 16 + i3 B_1 = T.Buffer((65536,), “int8”, data=B.data) B_buf_1[cse_var_2] = B_1[cse_var_2] C_buf_1 = T.Buffer((256,), “int32”, data=C_buf) for co, ci in T.grid(16, 16): C_buf_1[co * 16 + ci] = 0 for ko, ki in T.grid(16, 16): cse_var_3: T.int32 = co * 16 + ci C_buf_1[cse_var_3] = C_buf_1[cse_var_3] + T.Cast(“int32”, A_buf_1[ko * 16 + ki]) * T.Cast(“int32”, B_buf_1[co * 4096 + ko * 256 + ci * 16 + ki]) for i1, i3 in T.grid(16, 16): cse_var_4: T.int32 = i1 * 16 + i3 C_1 = T.Buffer((256,), “int8”, data=C.data) C_1[cse_var_4] = T.Cast(“int8”, C_buf_1[cse_var_4])

from tvm.script import ir as I

from tvm.script import tir as T

@I.ir_module class Module: @T.prim_func def main(A: T.Buffer((1, 16, 1, 16), “int8”), B: T.Buffer((16, 16, 16, 16), “int8”), C: T.Buffer((1, 16, 1, 16), “int8”)): T.func_attr({“from_legacy_te_schedule”: T.bool(True), “tir.noalias”: T.bool(True)}) vta = T.int32() with T.attr(T.iter_var(vta, None, “ThreadIndex”, “vta”), “coproc_scope”, 2): with T.attr(T.iter_var(vta, None, “ThreadIndex”, “vta”), “coproc_uop_scope”, “VTAPushGEMMOp”): T.call_extern(“int32”, “VTAUopLoopBegin”, 16, 1, 0, 0) T.tir.vta.uop_push(0, 1, 0, 0, 0, 0, 0, 0) T.call_extern(“int32”, “VTAUopLoopEnd”) T.tir.vta.coproc_dep_push(2, 1) for ko in range(16): with T.attr(T.iter_var(vta, None, “ThreadIndex”, “vta”), “coproc_scope”, 1): T.tir.vta.coproc_dep_pop(2, 1) T.call_extern(“int32”, “VTALoadBuffer2D”, T.tvm_thread_context(T.tir.vta.command_handle()), A.data, ko, 1, 1, 1, 0, 0, 0, 0, 0, 2) T.call_extern(“int32”, “VTALoadBuffer2D”, T.tvm_thread_context(T.tir.vta.command_handle()), B.data, ko, 1, 16, 16, 0, 0, 0, 0, 0, 1) T.tir.vta.coproc_dep_push(1, 2) T.attr(T.iter_var(vta, None, “ThreadIndex”, “vta”), “coproc_scope”, 2) T.tir.vta.coproc_dep_pop(1, 2) with T.attr(T.iter_var(vta, None, “ThreadIndex”, “vta”), “coproc_uop_scope”, “VTAPushGEMMOp”): T.call_extern(“int32”, “VTAUopLoopBegin”, 16, 1, 0, 1) T.tir.vta.uop_push(0, 0, 0, 0, 0, 0, 0, 0) T.call_extern(“int32”, “VTAUopLoopEnd”) T.tir.vta.coproc_dep_push(2, 1) T.tir.vta.coproc_dep_push(2, 3) T.tir.vta.coproc_dep_pop(2, 1) with T.attr(T.iter_var(vta, None, “ThreadIndex”, “vta”), “coproc_scope”, 3): T.tir.vta.coproc_dep_pop(2, 3) T.call_extern(“int32”, “VTAStoreBuffer2D”, T.tvm_thread_context(T.tir.vta.command_handle()), 0, 4, C.data, 0, 16, 1, 16) T.tir.vta.coproc_sync()

Cross compiled IR (matrix_multiply):

from tvm.script import ir as I

from tvm.script import tir as T

@I.ir_module class Module: @T.prim_func def main(A: T.Buffer((1, 16, 1, 16), “int8”), B: T.Buffer((16, 16, 16, 16), “int8”), C: T.Buffer((1, 16, 1, 16), “int8”)): T.func_attr({“from_legacy_te_schedule”: T.bool(True), “tir.noalias”: T.bool(True)}) A_buf = T.allocate([256], “int8”, “global”) B_buf = T.allocate([65536], “int8”, “global”) C_buf = T.allocate([256], “int32”, “global”) A_buf_1 = T.Buffer((256,), “int8”, data=A_buf) for i1, i3 in T.grid(16, 16): cse_var_1: T.int32 = i1 * 16 + i3 A_1 = T.Buffer((256,), “int8”, data=A.data) A_buf_1[cse_var_1] = A_1[cse_var_1] B_buf_1 = T.Buffer((65536,), “int8”, data=B_buf) for i0, i1, i2, i3 in T.grid(16, 16, 16, 16): cse_var_2: T.int32 = i0 * 4096 + i1 * 256 + i2 * 16 + i3 B_1 = T.Buffer((65536,), “int8”, data=B.data) B_buf_1[cse_var_2] = B_1[cse_var_2] C_buf_1 = T.Buffer((256,), “int32”, data=C_buf) for co, ci in T.grid(16, 16): C_buf_1[co * 16 + ci] = 0 for ko, ki in T.grid(16, 16): cse_var_3: T.int32 = co * 16 + ci C_buf_1[cse_var_3] = C_buf_1[cse_var_3] + T.Cast(“int32”, A_buf_1[ko * 16 + ki]) * T.Cast(“int32”, B_buf_1[co * 4096 + ko * 256 + ci * 16 + ki]) for i1, i3 in T.grid(16, 16): cse_var_4: T.int32 = i1 * 16 + i3 C_1 = T.Buffer((256,), “int8”, data=C.data) C_1[cse_var_4] = T.Cast(“int8”, C_buf_1[cse_var_4])

from tvm.script import ir as I

from tvm.script import tir as T

@I.ir_module class Module: @T.prim_func def main(A: T.Buffer((1, 16, 1, 16), “int8”), B: T.Buffer((16, 16, 16, 16), “int8”), C: T.Buffer((1, 16, 1, 16), “int8”)): T.func_attr({“from_legacy_te_schedule”: T.bool(True), “tir.noalias”: T.bool(True)}) C_buf = T.allocate([256], “int32”, “local.acc_buffer”) A_buf = T.allocate([16], “int8”, “local.inp_buffer”) B_buf = T.allocate([16], “int8”, “local.wgt_buffer”) C_buf_1 = T.Buffer((256,), “int32”, data=C_buf, scope=“local.acc_buffer”, align=16) for co, ci in T.grid(16, 16): C_buf_1[co * 16 + ci] = 0 for ko in range(16): i0 = T.int32() A_buf_1 = T.Buffer((16,), “int8”, data=A_buf, scope=“local.inp_buffer”, align=16) with T.attr(T.iter_var(i0, None, “DataPar”, “”), “pragma_dma_copy”, 1): for i3 in range(16): A_1 = T.Buffer((256,), “int8”, data=A.data) A_buf_1[i3] = A_1[ko * 16 + i3] i0_1 = T.int32() B_buf_1 = T.Buffer((16,), “int8”, data=B_buf, scope=“local.wgt_buffer”, align=256) with T.attr(T.iter_var(i0_1, None, “DataPar”, “”), “pragma_dma_copy”, 1): for i3 in range(16): B_1 = T.Buffer((65536,), “int8”, data=B.data) B_buf_1[i3] = B_1[co * 4096 + ko * 256 + ci * 16 + i3] for ki in range(16): cse_var_1: T.int32 = co * 16 + ci C_buf_1[cse_var_1] = C_buf_1[cse_var_1] + T.Cast(“int32”, A_buf_1[ki]) * T.Cast(“int32”, B_buf_1[ki]) i0 = T.int32() T.attr(T.iter_var(i0, None, “DataPar”, “”), “pragma_dma_copy”, 1) for i1, i3 in T.grid(16, 16): cse_var_2: T.int32 = i1 * 16 + i3 C_1 = T.Buffer((256,), “int8”, data=C.data) C_1[cse_var_2] = T.Cast(“int8”, C_buf_1[cse_var_2])

Do I need to modify any configuration to get the proper IR?