Do we have any way to drive codegen with more fine-grained control?

Below is an example of intrinsic selection.

for (i, 0, 65535) {
   C[i] = (A[i] + B[i])
}
Call Engine: veadd_mm
// normal ===stmt cost : 2061.94 (smallest cost) shape : 1x65535
 [ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
 ]

// normal and align === stmt cost : 2071.91 shape : 1x65472 
 [ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472, (int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472, (int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472, (int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
 ]

// reshape === stmt cost : 131080
 [ tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)65535, "CSR_STRIDE_D", 0, "CSR_STRIDE_S", 0))
 ]

// === stmt cost : 786420 
 [ for (i, 0, (int64)65535) {
  tx.veadd_mm(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), ((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), A, int64(i), ((int64)65535 - int64(i)), 1), tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 - int64(i)), 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
 ]

Call Engine: veadd_mv_dimh
// normal === stmt cost : 3085.91
 [ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65535, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
 ]

// normal and align === stmt cost : 2069.94
 [ tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)0, (int64)65535, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)0, (int64)65535, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)0, (int64)65535, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)65472, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, (int64)65472, (int64)63, 2), tir.tvm_access_ptr(tir.type_annotation(), B, (int64)65472, (int64)63, 1), tir.tvm_access_ptr(tir.type_annotation(), A, (int64)65472, (int64)63, 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)63, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
 ]

// === stmt cost : 720885
 [ for (i, 0, (int64)65535) {
  tx.veadd_mv_dimh(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), ((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 - int64(i)), 1), tir.tvm_access_ptr(tir.type_annotation(), A, int64(i), ((int64)65535 - int64(i)), 1), tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
 ]
Call Engine: veadd_mf
// === stmt cost : 720885
 [ for (i, 0, (int64)65535) {
  tx.veadd_mf(tir.tvm_access_ptr(tir.type_annotation(), C, int64(i), ((int64)65535 - int64(i)), 2), tir.tvm_access_ptr(tir.type_annotation(), B, int64(i), ((int64)65535 - int64(i)), 1), A[i], tx.csrw("CSR_SHAPE_S1_COL", (int64)1, "CSR_SHAPE_S1_ROW", (int64)1, "CSR_STRIDE_D", (int64)0, "CSR_STRIDE_S", (int64)0))
}
 ]

So we would need a large module (a lot of design and code) just to emit intrinsics; tensorization as the first-class mechanism does not fit NPUs well.