I am writing a matrix multiply example, just use block tricks to accelerate:
def matmul(N, L, M, dtype):
A = tvm.placeholder((N, L), name='A', dtype=dtype)
B = tvm.placeholder((L, M), name='B', dtype=dtype)
k = tvm.reduce_axis((0, L), name='k')
C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
s = tvm.create_schedule(C.op)
# schedule
y, x = s[C].op.axis
k = s[C].op.reduce_axis[0]
# factor change here
yo, yi = s[C].split(y, 32)
xo, xi = s[C].split(x, 32)
s[C].bind(yo, tvm.thread_axis("blockIdx.x"))
s[C].bind(yi, tvm.thread_axis("threadIdx.x"))
s[C].bind(xo, tvm.thread_axis("blockIdx.y"))
s[C].bind(xi, tvm.thread_axis("threadIdx.y"))
return s, [A, B, C]
after building the function, and I get the following cuda code:
extern "C" __global__ void default_function_kernel0( float* __restrict__ C, float* __restrict__ A, float* __restrict__ B) {
C[((((((int)blockIdx.x) * 8192) + (((int)threadIdx.x) * 256)) + (((int)blockIdx.y) * 32)) + ((int)threadIdx.y))] = 0.000000e+00f;
for (int k = 0; k < 128; ++k) {
C[((((((int)blockIdx.x) * 8192) + (((int)threadIdx.x) * 256)) + (((int)blockIdx.y) * 32)) + ((int)threadIdx.y))] = (C[((((((int)blockIdx.x) * 8192) + (((int)threadIdx.x) * 256)) + (((int)blockIdx.y) * 32)) + ((int)threadIdx.y))] + (A[(((((int)blockIdx.x) * 4096) + (((int)threadIdx.x) * 128)) + k)] * B[(((k * 256) + (((int)blockIdx.y) * 32)) + ((int)threadIdx.y))]));
}
}
However, I think it might be better if reorder blockIdx.y
and threadIdx.x
, so, I add the reorder
after binding the thread_axis:
def matmul(N, L, M, dtype):
A = tvm.placeholder((N, L), name='A', dtype=dtype)
B = tvm.placeholder((L, M), name='B', dtype=dtype)
k = tvm.reduce_axis((0, L), name='k')
C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
s = tvm.create_schedule(C.op)
# schedule
y, x = s[C].op.axis
k = s[C].op.reduce_axis[0]
# factor change here
yo, yi = s[C].split(y, 32)
xo, xi = s[C].split(x, 32)
s[C].bind(yo, tvm.thread_axis("blockIdx.x"))
s[C].bind(yi, tvm.thread_axis("threadIdx.x"))
s[C].bind(xo, tvm.thread_axis("blockIdx.y"))
s[C].bind(xi, tvm.thread_axis("threadIdx.y"))
# add reorder here
s[C].reorder(yo, xo, k, yi, xi)
print(tvm.lower(s, [A, B, C], simple_mode=True))
print("*" * 30)
return s, [A, B, C]
now, when I build the function, I occur the following error:
Traceback (most recent call last):
File "/home/nishome/jjzhou/tvm/nnvm/tutorials/matmul.py", line 37, in <module>
fcuda = tvm.build(s, data, "cuda")
File "/home/nishome/jjzhou/tvm/python/tvm/build_module.py", line 569, in build
binds=binds)
File "/home/nishome/jjzhou/tvm/python/tvm/build_module.py", line 412, in lower
return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
File "tvm/_ffi/_cython/./function.pxi", line 310, in tvm._ffi._cy3.core.FunctionBase.__call__
File "tvm/_ffi/_cython/./function.pxi", line 255, in tvm._ffi._cy3.core.FuncCall
File "tvm/_ffi/_cython/./base.pxi", line 170, in tvm._ffi._cy3.core.CALL
tvm._ffi.base.TVMError: Traceback (most recent call last):
[bt] (3) /home/nishome/jjzhou/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7f78b5fa68c1]
[bt] (2) /home/nishome/jjzhou/tvm/build/libtvm.so(+0x884bc0) [0x7f78b57f5bc0]
[bt] (1) /home/nishome/jjzhou/tvm/build/libtvm.so(+0xad778f) [0x7f78b5a4878f]
[bt] (0) /home/nishome/jjzhou/tvm/build/libtvm.so(+0x842ca2) [0x7f78b57b3ca2]
File "/home/nishome/jjzhou/tvm/src/pass/make_api.cc", line 188
TVMError: Not all Vars are passed in api_args: 'i.inner.init' 'j.inner.init' does not appeared in api_args
and tvm.lower
get :
produce C {
// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 16
// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 8
// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 32
// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
C[((((((blockIdx.x*32) + i.inner.init)*8) + blockIdx.y)*32) + j.inner.init)] = 0.000000f
for (k, 0, 128) {
// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 32
// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 32
C[((((((blockIdx.x*32) + threadIdx.x)*8) + blockIdx.y)*32) + threadIdx.y)] = (C[((((((blockIdx.x*32) + threadIdx.x)*8) + blockIdx.y)*32) + threadIdx.y)] + (A[((((blockIdx.x*32) + threadIdx.x)*128) + k)]*B[((((k*8) + blockIdx.y)*32) + threadIdx.y)]))
}
}
I don’t know why the code will produce i.inner.int
and j.inner.init
, who can tell me what are they?Thanks!