How does reorder work?

I am writing a matrix multiply example that just uses blocking tricks to accelerate it:

import tvm

def matmul(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    # factor change here
    yo, yi = s[C].split(y, 32)
    xo, xi = s[C].split(x, 32)
    
    s[C].bind(yo, tvm.thread_axis("blockIdx.x"))
    s[C].bind(yi, tvm.thread_axis("threadIdx.x"))
    s[C].bind(xo, tvm.thread_axis("blockIdx.y"))
    s[C].bind(xi, tvm.thread_axis("threadIdx.y"))

    return s, [A, B, C]
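For reference, I build and inspect the kernel roughly like this (the concrete shapes N=512, L=128, M=256 are only what the loop extents in the generated code suggest, not necessarily my exact script):

# Rough sketch of the driver; shapes and dtype are assumptions.
s, args = matmul(512, 128, 256, "float32")
fcuda = tvm.build(s, args, "cuda")
# Print the generated CUDA device code.
print(fcuda.imported_modules[0].get_source())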

After building the function, I get the following CUDA code:

extern "C" __global__ void default_function_kernel0( float* __restrict__ C,  float* __restrict__ A,  float* __restrict__ B) {
  C[((((((int)blockIdx.x) * 8192) + (((int)threadIdx.x) * 256)) + (((int)blockIdx.y) * 32)) + ((int)threadIdx.y))] = 0.000000e+00f;
  for (int k = 0; k < 128; ++k) {
    C[((((((int)blockIdx.x) * 8192) + (((int)threadIdx.x) * 256)) + (((int)blockIdx.y) * 32)) + ((int)threadIdx.y))] = (C[((((((int)blockIdx.x) * 8192) + (((int)threadIdx.x) * 256)) + (((int)blockIdx.y) * 32)) + ((int)threadIdx.y))] + (A[(((((int)blockIdx.x) * 4096) + (((int)threadIdx.x) * 128)) + k)] * B[(((k * 256) + (((int)blockIdx.y) * 32)) + ((int)threadIdx.y))]));
  }
}

However, I think it might be better to reorder blockIdx.y and threadIdx.x, so I add a reorder after binding the thread axes:

def matmul(N, L, M, dtype):
    A = tvm.placeholder((N, L), name='A', dtype=dtype)
    B = tvm.placeholder((L, M), name='B', dtype=dtype)

    k = tvm.reduce_axis((0, L), name='k')
    C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    s = tvm.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    # factor change here
    yo, yi = s[C].split(y, 32)
    xo, xi = s[C].split(x, 32)
    
    s[C].bind(yo, tvm.thread_axis("blockIdx.x"))
    s[C].bind(yi, tvm.thread_axis("threadIdx.x"))
    s[C].bind(xo, tvm.thread_axis("blockIdx.y"))
    s[C].bind(xi, tvm.thread_axis("threadIdx.y"))

    # add reorder here 
    s[C].reorder(yo, xo, k, yi, xi)

    print(tvm.lower(s, [A, B, C], simple_mode=True))
    print("*" * 30)
    return s, [A, B, C]
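The build call that fails is essentially the same driver as before (this is roughly the matmul.py line 37 that the traceback points at; shapes are again my assumption):

# Same driver as above; this build now raises the TVMError shown below.
s, data = matmul(512, 128, 256, "float32")
fcuda = tvm.build(s, data, "cuda")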

Now, when I build the function, I get the following error:

Traceback (most recent call last):
  File "/home/nishome/jjzhou/tvm/nnvm/tutorials/matmul.py", line 37, in <module>
    fcuda = tvm.build(s, data, "cuda")
  File "/home/nishome/jjzhou/tvm/python/tvm/build_module.py", line 569, in build
    binds=binds)
  File "/home/nishome/jjzhou/tvm/python/tvm/build_module.py", line 412, in lower
    return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
  File "tvm/_ffi/_cython/./function.pxi", line 310, in tvm._ffi._cy3.core.FunctionBase.__call__
  File "tvm/_ffi/_cython/./function.pxi", line 255, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/./base.pxi", line 170, in tvm._ffi._cy3.core.CALL
tvm._ffi.base.TVMError: Traceback (most recent call last):
  [bt] (3) /home/nishome/jjzhou/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7f78b5fa68c1]
  [bt] (2) /home/nishome/jjzhou/tvm/build/libtvm.so(+0x884bc0) [0x7f78b57f5bc0]
  [bt] (1) /home/nishome/jjzhou/tvm/build/libtvm.so(+0xad778f) [0x7f78b5a4878f]
  [bt] (0) /home/nishome/jjzhou/tvm/build/libtvm.so(+0x842ca2) [0x7f78b57b3ca2]
  File "/home/nishome/jjzhou/tvm/src/pass/make_api.cc", line 188
TVMError: Not all Vars are passed in api_args:  'i.inner.init'  'j.inner.init'  does not appeared in api_args

and tvm.lower gives:

produce C {
  // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 16
  // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 8
  // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 32
  // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
  C[((((((blockIdx.x*32) + i.inner.init)*8) + blockIdx.y)*32) + j.inner.init)] = 0.000000f
  for (k, 0, 128) {
    // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 32
    // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
    C[((((((blockIdx.x*32) + threadIdx.x)*8) + blockIdx.y)*32) + threadIdx.y)] = (C[((((((blockIdx.x*32) + threadIdx.x)*8) + blockIdx.y)*32) + threadIdx.y)] + (A[((((blockIdx.x*32) + threadIdx.x)*128) + k)]*B[((((k*8) + blockIdx.y)*32) + threadIdx.y)]))
  }
}

I don't know why the code produces i.inner.init and j.inner.init. Can anyone tell me what they are? Thanks!