[rfactor] Check failed: !load_remap_.count(buffers[idx])

I am trying to use rfactor in a convolution schedule, but it seems I didn't use it properly. The lowered result is below — could anybody give me some hints?

produce compute {
// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 256
// attr [compute.local] storage_scope = "local"
allocate compute.local[float32 * 7]
// attr [compute.local.rf] storage_scope = "local"
allocate compute.local.rf[float32 * 2]
// attr [compute.shared] storage_scope = "shared"
allocate compute.shared[float32 * 54]
// attr [placeholder.shared] storage_scope = "shared"
allocate placeholder.shared[float32 * 72]
// attr [compute.shared.local] storage_scope = "local"
allocate compute.shared.local[float32 * 1]
// attr [placeholder.shared.local] storage_scope = "local"
allocate placeholder.shared.local[float32 * 2]
// attr [reduce_temp0] storage_scope = "local"
allocate reduce_temp0[float32 * 1]
// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 14
produce compute.local {
for (ax2, 0, 7) {
// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 3
produce compute.local.rf {
compute.local.rf[0] = 0.000000f
compute.local.rf[1] = 0.000000f
for (rv.inner.outer, 0, 512) {
produce compute.shared {
for (ax0.ax1.fused.ax2.fused.ax3.fused.inner, 0, 4) {
if (likely(((threadIdx.x*4) < (54 - ax0.ax1.fused.ax2.fused.ax3.fused.inner)))) {
compute.shared[((((((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner)/54)*54) + ((((((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner)/9) % 3)*9) + (((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner) % 9))) + (((((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner)/27) % 2)*27))] = tvm_if_then_else((((((1 - ((((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner)/9) % 3)) <= ax2) && (ax2 < (8 - ((((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner)/9) % 3)))) && (1 <= (((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner) % 9))) && ((((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner) % 9) < 8)), placeholder[(((((((ax2 + (rv.inner.outer*14)) + ((((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner)/54)*7168))*7) + (((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner) % 9)) + (((((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner)/9) % 3)*7)) + (((((threadIdx.x*4) + ax0.ax1.fused.ax2.fused.ax3.fused.inner)/27) % 2)*49)) + -8)], 0.000000f)
}
}
}
produce placeholder.shared {
for (ax0.ax1.fused.ax2.fused.ax3.fused.inner, 0, 6) {
if (likely(((threadIdx.x*2) < (24 - (ax0.ax1.fused.ax2.fused.ax3.fused.inner/3))))) {
if (likely(((threadIdx.x*6) < (72 - ax0.ax1.fused.ax2.fused.ax3.fused.inner)))) {
if (likely(((blockIdx.x*4) < (1024 - (((threadIdx.x*2) + (ax0.ax1.fused.ax2.fused.ax3.fused.inner/3))/6))))) {
placeholder.shared[((((((threadIdx.x*2) + (ax0.ax1.fused.ax2.fused.ax3.fused.inner/3))/6)*18) + (((((threadIdx.x*2) + (ax0.ax1.fused.ax2.fused.ax3.fused.inner/3)) % 3)*3) + (ax0.ax1.fused.ax2.fused.ax3.fused.inner % 3))) + (((((threadIdx.x*2) + (ax0.ax1.fused.ax2.fused.ax3.fused.inner/3))/3) % 2)*9))] = placeholder[((((((blockIdx.x*2048) + rv.inner.outer) + ((((threadIdx.x*2) + (ax0.ax1.fused.ax2.fused.ax3.fused.inner/3))/6)*512))*18) + (((((threadIdx.x*2) + (ax0.ax1.fused.ax2.fused.ax3.fused.inner/3)) % 3)*3) + (ax0.ax1.fused.ax2.fused.ax3.fused.inner % 3))) + (((((threadIdx.x*2) + (ax0.ax1.fused.ax2.fused.ax3.fused.inner/3))/3) % 2)*9))]
}
}
}
}
}
}
for (rv.inner.outer, 0, 3) {
for (rv.inner.inner, 0, 2) {
produce compute.shared.local {
compute.shared.local[0] = compute.shared[((((threadIdx.x % 7) + threadIdx.y) + (rv.inner.outer*9)) + (rv.inner.inner*27))]
}
produce placeholder.shared.local {
placeholder.shared.local[0] = placeholder.shared[(((((threadIdx.x/7)*18) + threadIdx.y) + (rv.inner.outer*3)) + (rv.inner.inner*9))]
placeholder.shared.local[1] = placeholder.shared[((((((threadIdx.x/7)*18) + threadIdx.y) + (rv.inner.outer*3)) + (rv.inner.inner*9)) + 36)]
}
compute.local.rf[0] = (compute.local.rf[0] + (compute.shared.local[0]*placeholder.shared.local[0]))
compute.local.rf[1] = (compute.local.rf[1] + (compute.shared.local[0]*placeholder.shared.local[1]))
}
}
}
}
// attr [comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0.000000f])] reduce_scope = reinterpret((uint64)0)
tvm_thread_allreduce((uint32)1, compute.local.rf[0], (uint1)1, reduce_temp0, threadIdx.y)
tvm_thread_allreduce((uint32)1, compute.local.rf[1], (uint1)1, reduce_temp0, threadIdx.y)
compute.local[ax2] = reduce_temp0[0]
}
}
for (b.inner.inner.inner.c.inner.inner.inner.fused.h.inner.inner.inner.fused.w.inner.inner.inner.fused, 0, 7) {
compute[((((blockIdx.x*4) + (threadIdx.x/7))*49) + ((b.inner.inner.inner.c.inner.inner.inner.fused.h.inner.inner.inner.fused.w.inner.inner.inner.fused*7) + (threadIdx.x % 7)))] = compute.local[b.inner.inner.inner.c.inner.inner.inner.fused.h.inner.inner.inner.fused.w.inner.inner.inner.fused]
compute[(((((blockIdx.x*4) + (threadIdx.x/7))*49) + ((b.inner.inner.inner.c.inner.inner.inner.fused.h.inner.inner.inner.fused.w.inner.inner.inner.fused*7) + (threadIdx.x % 7))) + 98)] = compute.local[b.inner.inner.inner.c.inner.inner.inner.fused.h.inner.inner.inner.fused.w.inner.inner.inner.fused]
}
}

Traceback (most recent call last):
File "others/schedule_conv2d_nchw_cuda.py", line 158, in <module>
try_yolo_conv(batch_size=1)
File "others/schedule_conv2d_nchw_cuda.py", line 153, in try_yolo_conv
time_cost = _evaluate(s, arg_bufs, "cuda", dev_id, 100)
File "/home/measure.py", line 119, in _evaluate
raise e
File "/home/measure.py", line 106, in _evaluate
func = tvm.build(s, bufs, target)
File "/home/tvm/python/tvm/build_module.py", line 601, in build
fhost, mdev = _build_for_device(flist, tar, target_host)
File "/home/tvm/python/tvm/build_module.py", line 437, in _build_for_device
func = ir_pass.LowerThreadAllreduce(func, warp_size)
File "/home/tvm/python/tvm/_ffi/_ctypes/function.py", line 185, in __call__
ctypes.byref(ret_val), ctypes.byref(ret_tcode)))
File "/home/tvm/python/tvm/_ffi/base.py", line 71, in check_call
raise TVMError(py_str(_LIB.TVMGetLastError()))
tvm._ffi.base.TVMError: [10:44:58] /home/tvm/src/pass/lower_thread_allreduce.cc:190: Check failed: !load_remap_.count(buffers[idx])

Stack trace returned 10 entries:
[bt] (0) /home/tvm/build/libtvm.so(+0x8e4024) [0x7f28e679d024]
[bt] (1) /home/tvm/build/libtvm.so(+0x8e469d) [0x7f28e679d69d]
[bt] (2) /home/tvm/build/libtvm.so(+0xaf8d85) [0x7f28e69b1d85]
[bt] (3) /home/tvm/build/libtvm.so(+0xafa235) [0x7f28e69b3235]
[bt] (4) /home/tvm/build/libtvm.so(+0xab8ae0) [0x7f28e6971ae0]
[bt] (5) /home/tvm/build/libtvm.so(+0xac1665) [0x7f28e697a665]
[bt] (6) /home/tvm/build/libtvm.so(+0x93332b) [0x7f28e67ec32b]
[bt] (7) /home/tvm/build/libtvm.so(tvm::ir::IRMutator::Mutate(HalideIR::Internal::Stmt)+0x47) [0x7f28e6839427]
[bt] (8) /home//tvm/build/libtvm.so(tvm::ir::IRMutator::Mutate_(HalideIR::Internal::Block const*, HalideIR::Internal::Stmt const&)+0x76) [0x7f28e6973416]
[bt] (9) /home//tvm/build/libtvm.so(+0xab8ac0) [0x7f28e6971ac0]