The Matmul example in the auto tunning only open the N, M dim tunning space, tried to open the L dim tuning space of Matmul with following code, but does not work, what is the issue here?
@autotvm.template
def matmul(N, L, M, dtype):
A = tvm.placeholder((N, L), name='A', dtype=dtype)
B = tvm.placeholder((L, M), name='B', dtype=dtype)
k = tvm.reduce_axis((0, L), name='k')
C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
s = tvm.create_schedule(C.op)
y, x = s[C].op.axis
k = s[C].op.reduce_axis[0]
##### define space begin #####
cfg = autotvm.get_config()
cfg.define_split("tile_y", y, num_outputs=2)
cfg.define_split("tile_x", x, num_outputs=2)
cfg.define_split("tile_k", k, num_outputs=3)
output = C
OL = s.cache_write(C, 'local')
# create cache stage
AA = s.cache_read(A, 'shared', [OL])
WW = s.cache_read(B, 'shared', [OL])
AL = s.cache_read(AA, 'local', [OL])
WL = s.cache_read(WW, 'local', [OL])
# schedule according to config
yo, yi = cfg["tile_y"].apply(s, C, y)
xo, xi = cfg["tile_x"].apply(s, C, x)
######### bind ###########
s[C].bind(yo,tvm.thread_axis("blockIdx.y"))
s[C].bind(xo,tvm.thread_axis("blockIdx.x"))
s[C].bind(yi,tvm.thread_axis("threadIdx.y"))
s[C].bind(xi,tvm.thread_axis("threadIdx.x"))
s[C].reorder(yo, xo, yi, xi)
s[OL].compute_at(s[C],xi)
# tile reduction axes
y, x = s[OL].op.axis
k = s[OL].op.reduce_axis[0]
ko,km, ki = cfg['tile_k'].apply(s, OL, k)
s[OL].reorder(ko, km, ki, y, x)
s[AA].compute_at(s[OL], ko)
s[WW].compute_at(s[OL], ko)
s[AL].compute_at(s[OL], km)
s[WL].compute_at(s[OL], km)
# cooperative fetching
for load in [AA, WW]:
y, x = s[load].op.axis
fused = s[load].fuse(y, x)
ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[0])
tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[0])
s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
######### bind end #######
return s, [A, B, C]