Is it possible to pull an if_then_else statement out to the outer loop?

An if_then_else in an inner loop can be hoisted to the outer level when its condition does not depend on the inner loop variable. For example,

produce Tensor {
  for (i0, 0, 64) {
    for (i1, 0, 4) {
      Tensor[(i0*4)+i1] = tvm_if_then_else(cond, input0[(i0*4)+i1], 0)
    }
  }
}

can be transformed to

produce Tensor {
    for (i0, 0, 64) {
        Tensor[ramp(i0*4,1,4)] = tvm_if_then_else(cond, input0[ramp(i0*4, 1, 4)], (int4)0)
    }
}
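
For concreteness, here is a minimal sketch of how I produce this pattern (tvm Python API; `flag` is my stand-in for the loop-invariant `cond` above):

    import tvm

    # `flag` stands in for the loop-invariant condition; note that the
    # if_then_else never references the inner axis i1.
    flag = tvm.var("flag")
    input0 = tvm.placeholder((256,), name="input0", dtype="int32")
    T = tvm.compute(
        (256,),
        lambda i: tvm.if_then_else(flag > 0, input0[i], tvm.const(0, "int32")),
        name="Tensor",
    )

    s = tvm.create_schedule(T.op)
    i0, i1 = s[T].split(T.op.axis[0], factor=4)
    s[T].vectorize(i1)
    print(tvm.lower(s, [input0, T], simple_mode=True))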

Is this possible?

The real problem here is nn.dilate: when we use a packed kernel layout (e.g. OIHW4o4i), we cannot use vectorized loads for the inner loops (4o, 4i), because dilation emits a select statement for each element.
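
To make that concrete, the dilated kernel is computed with a per-element select along these lines (a rough sketch; the shapes and names are made up, but the structure matches what dilate generates):

    import tvm

    # OIHW4o4i kernel, dilated by 2 along H and W. Every element of the
    # result is a select, but the condition only involves kh/kw, never
    # the inner 4o/4i axes.
    W = tvm.placeholder((8, 8, 3, 3, 4, 4), name="kernel", dtype="int8")
    D = tvm.compute(
        (8, 8, 5, 5, 4, 4),
        lambda o, i, kh, kw, ob, ib: tvm.select(
            tvm.all(kh % 2 == 0, kw % 2 == 0),
            W[o, i, kh / 2, kw / 2, ob, ib],
            tvm.const(0, "int8"),
        ),
        name="dilated",
    )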

Does the condition hold for all values in the vector, or can it be different?

The condition holds for all values in the inner loop.

Maybe you can use “tensorize” to replace the whole statement with the one you want, but you should check that the allocation or buffer size is what you expect.
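
If it helps, here is a rough skeleton of that idea; everything in it (intrin_vec_copy, the shapes, the dtype) is mine and only illustrates the mechanics:

    import tvm

    # Declare a `lanes`-wide intrinsic that does a single vector
    # load/store, then substitute it for the inner loop with
    # s[T].tensorize(inner_axis, intrin_vec_copy(4)).
    def intrin_vec_copy(lanes):
        a = tvm.placeholder((lanes,), name="a", dtype="int32")
        c = tvm.compute((lanes,), lambda i: a[i], name="c")
        Ab = tvm.decl_buffer(a.shape, a.dtype, name="Ab", offset_factor=1)
        Cb = tvm.decl_buffer(c.shape, c.dtype, name="Cb", offset_factor=1)

        def intrin_func(ins, outs):
            ib = tvm.ir_builder.create()
            aa, cc = ins[0], outs[0]
            # One vector store of one vector load, no scalar loop.
            ib.emit(cc.vstore([0], aa.vload([0], "int32x%d" % lanes)))
            return ib.get()

        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, c: Cb})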

This is the check, right?

What happens if you just manually bypass the check?

Actually, the simplified example I posted can be vectorized. The problem I ran into is:

    unrolled (kh, 0, 3) {
      unrolled (kw, 0, 3) {
        unrolled (ic_block.inner, 0, 4) {
          unrolled (vthread.s, 0, 16) {
            compute[(oc_block + (vthread.s*28))] = (compute[(oc_block + (vthread.s*28))] + (int32(pad_data.shared[((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
          }
          unrolled (vthread.s, 0, 16) {
            compute[((oc_block + (vthread.s*28)) + 4)] = (compute[((oc_block + (vthread.s*28)) + 4)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 128)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
          }
          unrolled (vthread.s, 0, 16) {
            compute[((oc_block + (vthread.s*28)) + 8)] = (compute[((oc_block + (vthread.s*28)) + 8)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 256)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
          }
          unrolled (vthread.s, 0, 16) {
            compute[((oc_block + (vthread.s*28)) + 12)] = (compute[((oc_block + (vthread.s*28)) + 12)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 384)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
          }
          unrolled (vthread.s, 0, 16) {
            compute[((oc_block + (vthread.s*28)) + 16)] = (compute[((oc_block + (vthread.s*28)) + 16)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 512)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
          }
          unrolled (vthread.s, 0, 16) {
            compute[((oc_block + (vthread.s*28)) + 20)] = (compute[((oc_block + (vthread.s*28)) + 20)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 640)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
          }
          unrolled (vthread.s, 0, 16) {
            compute[((oc_block + (vthread.s*28)) + 24)] = (compute[((oc_block + (vthread.s*28)) + 24)] + (int32(pad_data.shared[(((((((threadIdx.y*16) + threadIdx.x) + (kh*16)) + kw)*4) + ic_block.inner) + 768)])*int32(tvm_if_then_else((((kh % 2) == 0) && ((kw % 2) == 0)), kernel.shared[((((((((vthread.s*4) + threadIdx.z)*2) + (kh/2))*2) + (kw/2))*4) + ic_block.inner)], (int8)0))))
          }
        }
      }
    }

I expect the conditions of all these if_then_else calls to be hoisted outside and merged into a single check, since they are identical.
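
Because the condition only involves kh and kw, the lowered code could in principle look roughly like:

    unrolled (kh, 0, 3) {
      unrolled (kw, 0, 3) {
        if (((kh % 2) == 0) && ((kw % 2) == 0)) {
          // the ic_block.inner / vthread.s loops, reading kernel.shared directly
        } else {
          // the same loops with the kernel operand replaced by (int8)0
        }
      }
    }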

Another issue I hit when vectorizing loops:
@tqchen https://github.com/dmlc/tvm/blob/c3b569d2f6055204de406398dbef7702103e3f2e/src/codegen/codegen_cuda.cc#L277
Broadcast is converted to a make_xx constructor in the CUDA codegen, e.g. make_int4. But there is no make_int, which leads to an error when we use (x4)int8. We need a way to generate vectorized values in codegen.
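
A minimal repro sketch (my own toy example, not the conv2d code above; building it for CUDA hits the broadcast problem, since the int8 constant becomes an (x4)int8 value once the axis is vectorized):

    import tvm

    A = tvm.placeholder((64,), name="A", dtype="int8")
    B = tvm.compute((64,), lambda i: A[i] * tvm.const(2, "int8"), name="B")
    s = tvm.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=4)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].vectorize(tx)
    f = tvm.build(s, [A, B], "cuda")  # fails when codegen prints the broadcast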

This seems like something that can be patched in the CUDA code generator; can you take a stab at that?