I’m trying to create a simple schedule that uses blocking and local buffers. See below.
import argparse
import os
import tvm
from tvm import te
import numpy as np
# Define the tensor operation: per-channel scaling, C[i, j, k] = A[i, j, k] * B[i].
# (This is an elementwise multiply with B broadcast over j and k, not a
# matrix-vector product.)
A = te.placeholder(shape=(672, 14, 14), name='A')
B = te.placeholder(shape=(672,), name='B')
C = te.compute(A.shape, lambda i, j, k: A[i, j, k] * B[i], name='C')
print(C)

s = te.create_schedule([C.op])

# input tensors
a = s[C].op.input_tensors[0]
b = s[C].op.input_tensors[1]

# local input buffer for a
aa = s.cache_read(a, "local", s[C].op)

# local output buffer for c
# Uncomment the cache_write line to enable the output buffer. It is created
# before the split below, while C's loop nest is still untouched.
cc = None
#cc: te.tensor.Tensor = s.cache_write(C, "local")

# blocking on channel axis: 672x14x14 --> 84x(8x14x14)
(io, ii) = s[C].split(s[C].op.axis[0], factor=8)

# sink copy-in into loop nest
s[aa].compute_at(s[C], io)

if cc is not None:
    # sink copy-out into loop nest
    # With the output buffer enabled, the stage that reads A.local is C.local
    # (see the consumer named in the "cannot find the producer" check failure),
    # so C.local has to be attached under the same `io` loop as A.local;
    # otherwise A.local sits inside C's loop nest while its consumer does not.
    # NOTE(review): confirm this resolves the check on your TVM version.
    s[cc].compute_at(s[C], io)

# additional passes once above works
ENABLE_DOUBLE_BUFFER = False
if ENABLE_DOUBLE_BUFFER:
    # double buffer a-->aa input copies
    s[aa].double_buffer()
    if cc is not None:
        # double buffer cc-->c output copies
        s[cc].double_buffer()

fname = "main"
irmod = tvm.driver.build_module.form_irmodule(s, [A, B, C], name=fname, binds=None)
print(irmod)
But I can’t get the output buffer to work. If I uncomment the line with the “local output buffer” comment, I get this internal error:
Check failed: found_attach || stage_attach.size() == 0 == false: Invalid Schedule, cannot find the producer compute(A.local, body=[A[ax0, ax1, ax2]], axis=[iter_var(ax0, range(min=0, ext=672)), iter_var(ax1, range(min=0, ext=14)), iter_var(ax2, range(min=0, ext=14))], reduce_axis=[], tag=, attrs={}) along the loop nest specified by compute_at of consumer compute(C.local, body=[(A.local[i.c, j.c, k.c]*B[i.c])], axis=[iter_var(i.c, range(min=0, ext=672)), iter_var(j.c, range(min=0, ext=14)), iter_var(k.c, range(min=0, ext=14))], reduce_axis=[], tag=, attrs={})
It also works if I enable the cache_write, but disable the blocking and compute_at. I cannot figure out what’s going on. Any suggestions?
-Alan