Does TVM introduce vthread because StrideSets are not used in the InferBound pass?
import tvm
from tvm import te
# Shared tensors: B copies A element-wise, and C copies B element-wise.
M = 128
A = te.placeholder((M,), name="A")
B = te.compute((M,), lambda i: A[i], name="B")
C = te.compute((M,), lambda i: B[i], name="C")


def _lower_variant(use_vthread):
    """Schedule C with a factor-4 split and return the lowered TIR as text.

    After the split, the outer axis is either bound to a "vthread" thread
    axis named "vx" (use_vthread=True) or moved inside the inner axis with
    reorder (use_vthread=False). In both variants B is then attached to C
    at the inner axis, and the schedule is lowered with A and B as the
    argument list.
    """
    sched = te.create_schedule(C.op)
    (axis,) = sched[C].op.axis
    outer, inner = sched[C].split(axis, factor=4)
    if use_vthread:
        sched[C].bind(outer, te.thread_axis("vthread", name="vx"))
    else:
        sched[C].reorder(inner, outer)
    # Attach B only after the outer axis has been arranged, matching the
    # order of schedule calls in both experiments.
    sched[B].compute_at(sched[C], inner)
    return str(tvm.lower(sched, [A, B], simple_mode=True))


# 1. not vthread
tir = _lower_variant(use_vthread=False)
print(tir)
# 2. vthread
tir = _lower_variant(use_vthread=True)
print(tir)
primfn(A_1: handle, B_1: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {B: Buffer(B_2: Pointer(float32), float32, [128], []),
A: Buffer(A_2: Pointer(float32), float32, [128], [])}
buffer_map = {A_1: A, B_1: B} {
attr [C: Pointer(float32)] "storage_scope" = "global";
allocate(C, float32, [128]);
for (i.inner: int32, 0, 4) {
for (i: int32, 0, 125) { // note: should be inferred as for (i: int32, 0, 32)
B_2[(i + i.inner)] = (float32*)A_2[(i + i.inner)]
}
for (i.outer: int32, 0, 32) {
C[((i.outer*4) + i.inner)] = (float32*)B_2[((i.outer*4) + i.inner)]
}
}
}
primfn(A_1: handle, B_1: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {B: Buffer(B_2: Pointer(float32), float32, [128], []),
A: Buffer(A_2: Pointer(float32), float32, [128], [])}
buffer_map = {A_1: A, B_1: B} {
attr [C: Pointer(float32)] "storage_scope" = "global";
allocate(C, float32, [128]);
for (i.inner: int32, 0, 4) {
for (vx.s: int32, 0, 32) {
B_2[((vx.s*4) + i.inner)] = (float32*)A_2[((vx.s*4) + i.inner)]
}
for (vx.s_1: int32, 0, 32) {
C[((vx.s_1*4) + i.inner)] = (float32*)B_2[((vx.s_1*4) + i.inner)]
}
}
}