TVM seems to fail when the number of CUDA kernels to build in a single module is very large.
Minimal reproducible example:
# Imports for the reproducer (cc/tar and topi are unused here but kept as in the report).
import tvm
from tvm.contrib import cc, tar
import numpy as _np
import topi
# Symbolic matrix dimensions so the generated kernel handles any m x n input.
m = tvm.var('m')
n = tvm.var('n')
dtype = 'float32'
# First CUDA device.
ctx = tvm.context('cuda', 0)
# Threads per CUDA block used when splitting the fused loop below.
num_thread = 64
def add_gpu():
    """Build a CUDA schedule for elementwise addition c = a + b.

    Returns:
        (schedule, [a, b, c]): the schedule plus the placeholder/output
        tensors, ready to be passed to ``tvm.lower``/``tvm.build``.
    """
    # NOTE(review): indentation was lost in the original paste (the body sat
    # at column 0, escaping the def); restored here so the script runs.
    a = tvm.placeholder((m, n), name='a', dtype=dtype)
    b = tvm.placeholder((m, n), name='b', dtype=dtype)
    c = tvm.compute((m, n), lambda i, j: a[i, j] + b[i, j], name='c')
    s = tvm.create_schedule(c.op)
    c_list = [c]
    for t in c_list:
        block_x = tvm.thread_axis("blockIdx.x")
        thread_x = tvm.thread_axis("threadIdx.x")
        # Fuse both loop axes, then split into (block, thread) with
        # num_thread threads per block, and bind to the CUDA grid.
        axes = [axis for axis in t.op.axis]
        fused = s[t].fuse(*axes)
        bx, tx = s[t].split(fused, factor=num_thread)
        s[t].bind(bx, block_x)
        s[t].bind(tx, thread_x)
    return s, [a, b, c]
s, args = add_gpu()
# Inspect the lowered IR first. Renamed 'add_cpu' -> 'add_gpu' for
# consistency: this is a GPU schedule and the built module is indexed
# by 'add_gpu' below.
stmt = tvm.lower(s, args, simple_mode=True, name='add_gpu')
print(stmt)
print('build 1 time...')
# Building a single kernel works fine.
func_list_cuda = [tvm.lower(s, args, name='add_gpu')]
lowered_funcs = {"cuda": func_list_cuda}
func_binary_cuda = tvm.build(lowered_funcs, name='lib')
# Sanity check: run the single kernel on small concrete inputs.
a = tvm.nd.array(_np.array([[1, 2], [3, 4]], dtype=dtype), ctx=ctx)
b = tvm.nd.array(_np.array([[5, 6], [7, 8]], dtype=dtype), ctx=ctx)
c = tvm.nd.array(_np.zeros((2, 2), dtype=dtype), ctx=ctx)
func_binary_cuda['add_gpu'](a, b, c)
print(c)
print('build 50000 times...')
print('lower...')
# 50000 copies of the same kernel under distinct names. Packing this many
# kernels into one CUDA module is what triggers the build failure.
func_list_cuda = [tvm.lower(s, args, name='add_gpu_{}'.format(i)) for i in range(50000)]
lowered_funcs = {"cuda": func_list_cuda}
print('build...')
func_binary_cuda = tvm.build(lowered_funcs, name='lib')
The output is:
produce c {
// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = floordiv(((m*n) + 63), 64)
// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 64
if (likely((floordiv(((blockIdx.x*64) + threadIdx.x), n) < m))) {
if (likely((floormod(((blockIdx.x*64) + threadIdx.x), n) < n))) {
if (likely((((blockIdx.x*64) + threadIdx.x) < (m*n)))) {
if (likely((0 <= floordiv(((blockIdx.x*64) + threadIdx.x), n)))) {
if (likely((floordiv(((blockIdx.x*64) + threadIdx.x), n) < m))) {
if (likely((0 <= floormod(((blockIdx.x*64) + threadIdx.x), n)))) {
if (likely((floormod(((blockIdx.x*64) + threadIdx.x), n) < n))) {
c[((floordiv(((blockIdx.x*64) + threadIdx.x), n)*stride) + (floormod(((blockIdx.x*64) + threadIdx.x), n)*stride))] = (a[((floordiv(((blockIdx.x*64) + threadIdx.x), n)*stride) + (floormod(((blockIdx.x*64) + threadIdx.x), n)*stride))] + b[((floordiv(((blockIdx.x*64) + threadIdx.x), n)*stride) + (floormod(((blockIdx.x*64) + threadIdx.x), n)*stride))])
}
}
}
}
}
}
}
}
build 1 time...
[[ 6. 8.]
[10. 12.]]
build 50000 times...
lower...
build...
Traceback (most recent call last):
File "test.py", line 57, in <module>
func_binary_cuda = tvm.build(lowered_funcs, name='lib')
File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/build_module.py", line 637, in build
fhost, mdev = _build_for_device(flist, tar, target_host)
File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/build_module.py", line 503, in _build_for_device
mdev = codegen.build_module(fdevice, str(target)) if fdevice else None
File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/codegen.py", line 36, in build_module
return _Build(lowered_func, target)
File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/_ffi/_ctypes/function.py", line 207, in __call__
raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
[bt] (5) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(TVMFuncCall+0x61) [0x7f0107e0c071]
[bt] (4) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(+0xa81b0e) [0x7f0107650b0e]
[bt] (3) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::codegen::Build(tvm::Array<tvm::LoweredFunc, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x1e3) [0x7f0107769683]
[bt] (2) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::Array<tvm::LoweredFunc, void>)>::AssignTypedLambda<tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>)>(tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)+0x4e) [0x7f010779630e]
[bt] (1) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(tvm::codegen::BuildCUDA(tvm::Array<tvm::LoweredFunc, void>)+0x402) [0x7f0107dad622]
[bt] (0) /home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/libtvm.so(+0x12387bb) [0x7f0107e077bb]
File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/_ffi/_ctypes/function.py", line 72, in cfun
rv = local_pyfunc(*pyargs)
File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/autotvm/measure/measure_methods.py", line 585, in tvm_callback_cuda_compile
ptx = nvcc.compile_cuda(code, target="ptx", arch=AutotvmGlobalScope.current.cuda_target_arch)
File "/home/ubuntu/anaconda3/envs/mx/lib/python3.7/site-packages/tvm-0.6.dev0-py3.7-linux-x86_64.egg/tvm/contrib/nvcc.py", line 98, in compile_cuda
raise RuntimeError(msg)
RuntimeError: Compilation error:
<unnamed>: parse Invalid record
Could the kernels be compiled one by one (kernel by kernel) instead of all at once, to avoid this failure?