Here is an example illustrating how an intrinsic is chosen.
for (i, 0, 65535) {
C[i] = (A[i] + B[i])
}
Call Engine: veadd_mm
// normal === stmt cost : 2061.94 (smallest cost) shape : 1x65535
[ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
]
// normal and align === stmt cost : 2071.91 shape : 1x65472
[ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472, (int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472, (int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472, (int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
]
// reshape === stmt cost : 131080
[ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)65535, "CSR_STRIDE_D", 0, "CSR_STRIDE_S", 0))
]
// === stmt cost : 786420
[ for (i, 0, (int64)65535) {
tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), ((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), A, int64(i), ((int64)65535 - int64(i)), 1), tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 - int64(i)), 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
]
Call Engine: veadd_mv_dimh
// normal === stmt cost : 3085.91
[ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
]
// normal and align === stmt cost : 2069.94
[ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472, (int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472, (int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472, (int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
]
// === stmt cost : 720885
[ for (i, 0, (int64)65535) {
tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), ((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 - int64(i)), 1), tir.tvm_access_ptr(tir.type_annotation(), A, int64(i), ((int64)65535 - int64(i)), 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
]
Call Engine: veadd_mf
// === stmt cost : 720885
[ for (i, 0, (int64)65535) {
tx.veadd_mf(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), ((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 - int64(i)), 1), A[i], tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
]
So we need a big module (a lot of design and code) to emit intrinsics; tensorization as the first step does not fit NPUs well.