Hi, I want to test conv2d_grad on GPU, but I got an error.
I added this test case to def test_conv2d_grad() in /tvm/tests/python/relay/test_op_grad_level2.py:
verify_conv2d_grad((1, 736, 17, 17), (128, 736, 1, 1), [1, 1], [0, 0], [1, 1])
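For context, this is roughly what the test function looks like after my change (existing cases elided, they may differ slightly in your checkout; my addition is the last call):

```python
def test_conv2d_grad():
    # ... existing verify_conv2d_grad cases ...
    # my new case:
    verify_conv2d_grad((1, 736, 17, 17), (128, 736, 1, 1), [1, 1], [0, 0], [1, 1])
```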
When I ran it on GPU, an error occurred. Here is the error log:
Cannot find config for target=cuda, workload=('conv2d', (1, 736, 17, 17, 'float32'), (128, 736, 1, 1, 'float32'), (1, 1), (0, 0), (1, 1), 'NCHW', 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=cuda, workload=('conv2d_transpose_nchw', (1, 128, 17, 17, 'float32'), (128, 736, 1, 1, 'float32'), (1, 1), (0, 0), 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=cuda, workload=('group_conv2d_nchw', (1, 736, 17, 17, 'float32'), (94208, 1, 17, 17, 'float32'), (1, 1), (0, 0), (1, 1), 736, 'float32'). A fallback configuration is used, which may bring great performance regression.
Traceback (most recent call last):
  File "/tvm/tests/python/relay/test_op_grad_level2.py", line 132, in <module>
    test_conv2d_grad()
  File "/tvm/tests/python/relay/test_op_grad_level2.py", line 125, in test_conv2d_grad
    verify_conv2d_grad((1, 736, 17, 17), (128, 736, 1, 1), [1, 1], [0, 0], [1, 1])
  File "/tvm/tests/python/relay/test_op_grad_level2.py", line 113, in verify_conv2d_grad
    op_res, (grad_input, grad_weight) = intrp.evaluate(bwd_func)(data, weight)
  File "/tvm/python/tvm/relay/backend/interpreter.py", line 316, in _interp_wrapper
    return _intrp(opt_expr)
  File "/tvm/python/tvm/_ffi/_ctypes/function.py", line 210, in __call__
    raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
[bt] (6) /tvm/build/libtvm.so(TVMFuncCall+0x95) [0x7f5ddc183d8a]
[bt] (5) /tvm/build/libtvm.so(tvm::runtime::PackedFunc::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const+0x30) [0x7f5ddb810ecc]
[bt] (4) /tvm/build/libtvm.so(std::function<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const+0x5a) [0x7f5ddb6a82c2]
[bt] (3) /tvm/build/libtvm.so(std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), tvm::runtime::detail::PackFuncVoidAddr<4, tvm::runtime::CUDAWrappedFunc>(tvm::runtime::CUDAWrappedFunc, std::vector<tvm::runtime::detail::ArgConvertCode, std::allocator<tvm::runtime::detail::ArgConvertCode> > const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)+0x58) [0x7f5ddc2051d2]
[bt] (2) /tvm/build/libtvm.so(tvm::runtime::detail::PackFuncVoidAddr<4, tvm::runtime::CUDAWrappedFunc>(tvm::runtime::CUDAWrappedFunc, std::vector<tvm::runtime::detail::ArgConvertCode, std::allocator<tvm::runtime::detail::ArgConvertCode> > const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const+0x23d) [0x7f5ddc2030db]
[bt] (1) /tvm/build/libtvm.so(tvm::runtime::CUDAWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*, void**) const+0x5ae) [0x7f5ddc20167c]
[bt] (0) /tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x34) [0x7f5ddb62af2c]
File "/tvm/src/runtime/cuda/cuda_module.cc", line 215
File "/tvm/src/runtime/module_util.cc", line 73
TVMError: Check failed: ret == 0 (-1 vs. 0) : CUDALaunch Error: CUDA_ERROR_INVALID_VALUE
grid=(1,94208,1), block=(1,1,1)
// func_name=fused_nn_conv2d_1_kernel0
// CUDA Source
// -----------
extern "C" __global__ void fused_nn_conv2d_1_kernel0(float* __restrict__ placeholder, float* __restrict__ placeholder1, float* __restrict__ compute) {
  float compute_local[1];
  __shared__ float pad_temp_shared[1];
  __shared__ float placeholder_shared[1];
  compute_local[0] = 0.000000e+00f;
  for (int ry_outer = 0; ry_outer < 17; ++ry_outer) {
    for (int rx_outer = 0; rx_outer < 17; ++rx_outer) {
      pad_temp_shared[0] = placeholder[((((((int)blockIdx.y) / 128) * 289) + (ry_outer * 17)) + rx_outer)];
      placeholder_shared[0] = placeholder1[(((((int)blockIdx.y) * 289) + (ry_outer * 17)) + rx_outer)];
      compute_local[0] = (compute_local[0] + (pad_temp_shared[0] * placeholder_shared[0]));
    }
  }
  compute[((int)blockIdx.y)] = compute_local[0];
}
Process finished with exit code 1
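Looking at the failing launch configuration, grid=(1,94208,1) with block=(1,1,1), I suspect the grid's y-dimension is the problem: 94208 exceeds CUDA's maximum gridDim.y of 65535, which would explain CUDA_ERROR_INVALID_VALUE. Here is my rough reading of where that number comes from (an assumption on my part, so please correct me if I'm wrong):

```python
# Rough arithmetic behind the oversized launch (my reading of the log):
# the weight-gradient workload above is group_conv2d_nchw with weight shape
# (94208, 1, 17, 17), and 94208 = out_channels * in_channels of my test case.
out_channels, in_channels = 128, 736
grid_y = out_channels * in_channels
print(grid_y)          # 94208 -> matches grid=(1,94208,1) in the error
print(grid_y > 65535)  # True -> exceeds CUDA's maximum gridDim.y of 65535
```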
The same test case works fine on CPU.
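In case it helps with reproducing: verify_conv2d_grad iterates over the enabled targets (via ctx_list(), if I read the test helper correctly), so "works on CPU" means the llvm path passes while the cuda path crashes. A quick way to check which contexts are enabled:

```python
import tvm

# Quick check of which device contexts exist in this build/on this machine;
# the llvm (CPU) path passes this test case, the cuda path crashes as above.
print(tvm.cpu(0).exist)  # True
print(tvm.gpu(0).exist)  # True on my GPU machine
```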
@vinx13 Could you give me some suggestions? Or could anyone else offer suggestions? Thanks very much.