Actually, the simplified example I posted can be vectorized.
The problem I ran into is:
unrolled (kh, 0, 3) {
  unrolled (kw, 0, 3) {
    unrolled (ic_block.inner, 0, 4) {
      unrolled (vthread.s, 0, 16) {
        compute[(oc_block + (vthread.s*28))] = (compute[(oc_block + (vthread.s*28))] + (int32(pad_data.shared[((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
      }
      unrolled (vthread.s, 0, 16) {
        compute[((oc_block + (vthread.s*28)) + 4)] = (compute[((oc_block + (vthread.s*28)) + 4)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 128)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
      }
      unrolled (vthread.s, 0, 16) {
        compute[((oc_block + (vthread.s*28)) + 8)] = (compute[((oc_block + (vthread.s*28)) + 8)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 256)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
      }
      unrolled (vthread.s, 0, 16) {
        compute[((oc_block + (vthread.s*28)) + 12)] = (compute[((oc_block + (vthread.s*28)) + 12)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 384)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
      }
      unrolled (vthread.s, 0, 16) {
        compute[((oc_block + (vthread.s*28)) + 16)] = (compute[((oc_block + (vthread.s*28)) + 16)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 512)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
      }
      unrolled (vthread.s, 0, 16) {
        compute[((oc_block + (vthread.s*28)) + 20)] = (compute[((oc_block + (vthread.s*28)) + 20)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 640)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
      }
      unrolled (vthread.s, 0, 16) {
        compute[((oc_block + (vthread.s*28)) + 24)] = (compute[((oc_block + (vthread.s*28)) + 24)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 768)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
      }
    }
  }
}
I expect all the conditions in tvm_if_then_else to be hoisted out of the inner loops and merged into a single check, since they are all identical.
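To make the expectation concrete, here is a rough hand-written sketch (plain CUDA C, not TVM output; the names hoisted_sketch, acc, data and kern are made up, and the indexing is simplified). The predicate depends only on kh and kw, so it can be checked once per (kh, kw) iteration, and because the false branch multiplies by (int8)0 it can be skipped entirely:

// Illustration only: the loop-invariant predicate is hoisted out of the
// inner loops; the structure is the point, not the exact memory layout.
__device__ void hoisted_sketch(int* acc, const int* data, const signed char* kern) {
  for (int kh = 0; kh < 3; ++kh) {
    for (int kw = 0; kw < 3; ++kw) {
      if ((kh % 2) == 0 && (kw % 2) == 0) {   // evaluated once per (kh, kw)
        for (int v = 0; v < 16; ++v) {
          acc[v] += data[(kh * 3 + kw) * 16 + v]
                    * (int)kern[((kh / 2) * 2 + kw / 2) * 16 + v];
        }
      }
      // else branch: the kernel value is 0, so the update would be a no-op
    }
  }
}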
Another issue I hit when vectorizing a loop is:
@tqchen https://github.com/dmlc/tvm/blob/c3b569d2f6055204de406398dbef7702103e3f2e/src/codegen/codegen_cuda.cc#L277
Broadcast is converted to a make_xx call in CUDA, e.g. make_int4. But when we use (x4)int8, which the CUDA codegen prints as a plain int, there is no make_int constructor, so it leads to an error. We need a way to generate vectorized values in codegen.
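To make the gap concrete, here is a small hand-written CUDA sketch (not generated code; the function and variable names are mine, and the byte-packing is just one possible workaround, not a proposed fix) contrasting the int32x4 case, where make_int4 exists, with the int8x4 case, where the lanes have to be packed into a 32-bit word manually:

#include <cuda_runtime.h>  // int4, make_int4

// Sketch only: contrast of the two broadcast cases discussed above.
__device__ void broadcast_sketch(int x32, signed char x8, int4* out32, int* out8x4) {
  // int32x4: Broadcast lowers to the built-in make_int4 helper.
  *out32 = make_int4(x32, x32, x32, x32);

  // int8x4 is printed as a plain int, but CUDA has no make_int(a, b, c, d),
  // so the four int8 lanes must be packed into one 32-bit word by hand.
  unsigned int b = (unsigned char)x8;
  *out8x4 = (int)(b | (b << 8) | (b << 16) | (b << 24));
}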