I ran into this problem when I tried conv2d with an extremely large number of channels:
import tvm
from tvm import relay

x = relay.var("x", shape=[1, 256, 100, 100])
y = relay.var("y", shape=[65536, 1, 3, 3])
conv = relay.nn.conv2d(x, y, strides=(1, 1), padding=(1, 1), kernel_size=(3, 3), groups=256)
func = relay.Function((x, y), conv)
mod = tvm.IRModule.from_expr(func)
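For the record, I build and run it roughly like this (I'm on a recent TVM, so the graph runtime is tvm.contrib.graph_executor):

import numpy as np
from tvm.contrib import graph_executor

# Compiling succeeds; the bad kernel launch only fires when the module runs.
lib = relay.build(mod, target="cuda")
dev = tvm.cuda(0)
m = graph_executor.GraphModule(lib["default"](dev))
m.set_input("x", np.random.uniform(size=(1, 256, 100, 100)).astype("float32"))
m.set_input("y", np.random.uniform(size=(65536, 1, 3, 3)).astype("float32"))
m.run()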
Then I got this:
TVMError: Check failed: ret == 0 (-1 vs. 0) : CUDALaunch Error: CUDA_ERROR_INVALID_VALUE
grid=(10000,65536,1), block=(1,1,1)
// func_name=fused_nn_conv2d_kernel0
// CUDA Source
// -----------
extern "C" __global__ void fused_nn_conv2d_kernel0(float* __restrict__ placeholder, float* __restrict__ placeholder1, float* __restrict__ compute) {
  float compute_local[1];
  __shared__ float pad_temp_shared[1];
  __shared__ float placeholder_shared[1];
  compute_local[(0)] = 0.000000e+00f;
  for (int ry_outer = 0; ry_outer < 3; ++ry_outer) {
    for (int rx_outer = 0; rx_outer < 3; ++rx_outer) {
      pad_temp_shared[(0)] = (((((1 <= ((((int)blockIdx.x) / 100) + ry_outer)) && (((((int)blockIdx.x) / 100) + ry_outer) < 101)) && (1 <= (rx_outer + (((int)blockIdx.x) % 100)))) && ((rx_outer + (((int)blockIdx.x) % 100)) < 101)) ? placeholder[(((((((((int)blockIdx.y) >> 8) * 10000) + (ry_outer * 100)) + ((int)blockIdx.x)) + rx_outer) - 101))] : 0.000000e+00f);
      placeholder_shared[(0)] = placeholder1[((((((int)blockIdx.y) * 9) + (ry_outer * 3)) + rx_outer))];
      compute_local[(0)] = (compute_local[(0)] + (pad_temp_shared[(0)] * placeholder_shared[(0)]));
    }
  }
  compute[(((((int)blockIdx.y) * 10000) + ((int)blockIdx.x)))] = compute_local[(0)];
}
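If I'm reading the launch configuration correctly, the problem is the grid size: the generated schedule maps one output channel to each blockIdx.y (the kernel indexes weights by blockIdx.y * 9 and the output by blockIdx.y * 10000), so grid.y = 65536, but CUDA caps the y-dimension of a grid (maxGridSize[1]) at 65535. That would explain why the kernel compiles fine but the launch fails with CUDA_ERROR_INVALID_VALUE.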
I can't avoid this grouped convolution; it comes from a larger model that I need to compile.
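The only workaround I've come up with so far is to split the convolution in half along the groups (groups 0-127 and 128-255, i.e. output channels 0-32767 and 32768-65535) and concatenate the results, so each generated kernel should launch with grid.y = 32768, under the limit, assuming the same schedule. A grouped conv is block-diagonal across groups, so the split should be exact. A sketch (untested; it reuses x and y from above, and the split variable names are my own):

# Slice input channels and the matching weight halves per group block.
x0 = relay.strided_slice(x, begin=[0, 0, 0, 0], end=[1, 128, 100, 100])
x1 = relay.strided_slice(x, begin=[0, 128, 0, 0], end=[1, 256, 100, 100])
y0 = relay.strided_slice(y, begin=[0, 0, 0, 0], end=[32768, 1, 3, 3])
y1 = relay.strided_slice(y, begin=[32768, 0, 0, 0], end=[65536, 1, 3, 3])

# Two half-size grouped convs, then restore the original channel order.
conv0 = relay.nn.conv2d(x0, y0, strides=(1, 1), padding=(1, 1), kernel_size=(3, 3), groups=128)
conv1 = relay.nn.conv2d(x1, y1, strides=(1, 1), padding=(1, 1), kernel_size=(3, 3), groups=128)
out = relay.concatenate([conv0, conv1], axis=1)
func = relay.Function((x, y), out)
mod = tvm.IRModule.from_expr(func)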
But that feels like a hack. Does anyone know how to solve this problem properly? Any help would be appreciated.